Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Detecting mime types is a tricky business because we need to balance | 5 // Detecting mime types is a tricky business because we need to balance |
| 6 // compatibility concerns with security issues. Here is a survey of how other | 6 // compatibility concerns with security issues. Here is a survey of how other |
| 7 // browsers behave and then a description of how we intend to behave. | 7 // browsers behave and then a description of how we intend to behave. |
| 8 // | 8 // |
| 9 // HTML payload, no Content-Type header: | 9 // HTML payload, no Content-Type header: |
| 10 // * IE 7: Render as HTML | 10 // * IE 7: Render as HTML |
| (...skipping 99 matching lines...) | |
| 110 static const size_t kBytesRequiredForMagic = 42; | 110 static const size_t kBytesRequiredForMagic = 42; |
| 111 | 111 |
| 112 struct MagicNumber { | 112 struct MagicNumber { |
| 113 const char* mime_type; | 113 const char* mime_type; |
| 114 const char* magic; | 114 const char* magic; |
| 115 size_t magic_len; | 115 size_t magic_len; |
| 116 bool is_string; | 116 bool is_string; |
| 117 const char* mask; // if set, must have same length as |magic| | 117 const char* mask; // if set, must have same length as |magic| |
| 118 }; | 118 }; |
| 119 | 119 |
| 120 #define MAGIC_NUMBER(mime_type, magic) \ | 120 #define MAGIC_NUMBER(mime_type, magic) \ |
| 121 { (mime_type), (magic), sizeof(magic)-1, false, NULL }, | 121 { (mime_type), (magic), sizeof(magic) - 1, false, NULL } \ |
| 122 , | |
|
mmenke
2014/10/10 18:12:39
Hrm...That comma change is really weird.
| |
| 122 | 123 |
| 123 template <int MagicSize, int MaskSize> | 124 template <int MagicSize, int MaskSize> |
| 124 class VerifySizes { | 125 class VerifySizes { |
| 125 COMPILE_ASSERT(MagicSize == MaskSize, sizes_must_be_equal); | 126 COMPILE_ASSERT(MagicSize == MaskSize, sizes_must_be_equal); |
| 127 | |
| 126 public: | 128 public: |
| 127 enum { SIZES = MagicSize }; | 129 enum { SIZES = MagicSize }; |
| 128 }; | 130 }; |
| 129 | 131 |
| 130 #define verified_sizeof(magic, mask) \ | 132 #define verified_sizeof(magic, mask) \ |
| 131 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES | 133 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES |
| 132 | 134 |
| 133 #define MAGIC_MASK(mime_type, magic, mask) \ | 135 #define MAGIC_MASK(mime_type, magic, mask) \ |
| 134 { (mime_type), (magic), verified_sizeof(magic, mask)-1, false, (mask) }, | 136 { (mime_type), (magic), verified_sizeof(magic, mask) - 1, false, (mask) } \ |
| 137 , | |
| 135 | 138 |
| 136 // Magic strings are case insensitive and must not include '\0' characters | 139 // Magic strings are case insensitive and must not include '\0' characters |
| 137 #define MAGIC_STRING(mime_type, magic) \ | 140 #define MAGIC_STRING(mime_type, magic) \ |
| 138 { (mime_type), (magic), sizeof(magic)-1, true, NULL }, | 141 { (mime_type), (magic), sizeof(magic) - 1, true, NULL } \ |
| 142 , | |
| 139 | 143 |
| 140 static const MagicNumber kMagicNumbers[] = { | 144 static const MagicNumber kMagicNumbers[] = { |
| 141 // Source: HTML 5 specification | 145 // Source: HTML 5 specification |
| 142 MAGIC_NUMBER("application/pdf", "%PDF-") | 146 MAGIC_NUMBER("application/pdf", "%PDF-") |
| 143 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") | 147 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") |
| 144 MAGIC_NUMBER("image/gif", "GIF87a") | 148 MAGIC_NUMBER("image/gif", "GIF87a") MAGIC_NUMBER("image/gif", "GIF89a") |
| 145 MAGIC_NUMBER("image/gif", "GIF89a") | 149 MAGIC_NUMBER("image/png", |
| 146 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") | 150 "\x89" |
| 147 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") | 151 "PNG\x0D\x0A\x1A\x0A") |
| 148 MAGIC_NUMBER("image/bmp", "BM") | 152 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") MAGIC_NUMBER("image/bmp", "BM") |
| 149 // Source: Mozilla | 153 // Source: Mozilla |
| 150 MAGIC_NUMBER("text/plain", "#!") // Script | 154 MAGIC_NUMBER("text/plain", "#!") // Script |
| 151 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS | 155 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS |
| 152 MAGIC_NUMBER("text/plain", "From") | 156 MAGIC_NUMBER("text/plain", "From") MAGIC_NUMBER("text/plain", ">From") |
| 153 MAGIC_NUMBER("text/plain", ">From") | 157 // Chrome specific |
| 154 // Chrome specific | 158 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") |
| 155 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") | 159 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") MAGIC_NUMBER( |
| 156 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") | 160 "video/x-ms-asf", |
| 157 MAGIC_NUMBER("video/x-ms-asf", | 161 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") |
| 158 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") | 162 MAGIC_NUMBER("image/tiff", "I I") MAGIC_NUMBER("image/tiff", "II*") |
| 159 MAGIC_NUMBER("image/tiff", "I I") | 163 MAGIC_NUMBER("image/tiff", "MM\x00*") MAGIC_NUMBER("audio/mpeg", "ID3") |
| 160 MAGIC_NUMBER("image/tiff", "II*") | 164 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") |
| 161 MAGIC_NUMBER("image/tiff", "MM\x00*") | 165 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") |
| 162 MAGIC_NUMBER("audio/mpeg", "ID3") | 166 // TODO(abarth): we don't handle partial byte matches yet |
| 163 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") | 167 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") |
| 164 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") | 168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") |
| 165 // TODO(abarth): we don't handle partial byte matches yet | 169 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") |
| 166 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") | 170 MAGIC_NUMBER("application/zip", "PK\x03\x04") |
| 167 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") | 171 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") |
| 168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") | 172 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") |
| 169 MAGIC_NUMBER("application/zip", "PK\x03\x04") | 173 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE |
| 170 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") | 174 // Sniffing for Flash: |
| 171 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") | 175 // |
| 172 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE | 176 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") |
| 173 // Sniffing for Flash: | 177 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") |
| 174 // | 178 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") |
| 175 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | 179 // |
| 176 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") | 180 // Including these magic number for Flash is a trade off. |
| 177 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | 181 // |
| 178 // | 182 // Pros: |
| 179 // Including these magic number for Flash is a trade off. | 183 // * Flash is an important and popular file format |
| 180 // | 184 // |
| 181 // Pros: | 185 // Cons: |
| 182 // * Flash is an important and popular file format | 186 // * These patterns are fairly weak |
| 183 // | 187 // * If we mistakenly decide something is Flash, we will execute it |
| 184 // Cons: | 188 // in the origin of an unsuspecting site. This could be a security |
| 185 // * These patterns are fairly weak | 189 // vulnerability if the site allows users to upload content. |
| 186 // * If we mistakenly decide something is Flash, we will execute it | 190 // |
| 187 // in the origin of an unsuspecting site. This could be a security | 191 // On balance, we do not include these patterns. |
| 188 // vulnerability if the site allows users to upload content. | |
| 189 // | |
| 190 // On balance, we do not include these patterns. | |
| 191 }; | 192 }; |
| 192 | 193 |
| 193 // The number of content bytes we need to use all our Microsoft Office magic | 194 // The number of content bytes we need to use all our Microsoft Office magic |
| 194 // numbers. | 195 // numbers. |
| 195 static const size_t kBytesRequiredForOfficeMagic = 8; | 196 static const size_t kBytesRequiredForOfficeMagic = 8; |
| 196 | 197 |
| 197 static const MagicNumber kOfficeMagicNumbers[] = { | 198 static const MagicNumber kOfficeMagicNumbers[] = { |
| 198 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") | 199 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") |
| 199 MAGIC_NUMBER("OOXML", "PK\x03\x04") | 200 MAGIC_NUMBER("OOXML", "PK\x03\x04")}; |
|
mmenke
2014/10/10 18:12:39
Not putting the close brace on its own line seems
| |
| 200 }; | |
| 201 | 201 |
| 202 enum OfficeDocType { | 202 enum OfficeDocType { |
| 203 DOC_TYPE_WORD, | 203 DOC_TYPE_WORD, |
| 204 DOC_TYPE_EXCEL, | 204 DOC_TYPE_EXCEL, |
| 205 DOC_TYPE_POWERPOINT, | 205 DOC_TYPE_POWERPOINT, |
| 206 DOC_TYPE_NONE | 206 DOC_TYPE_NONE |
| 207 }; | 207 }; |
| 208 | 208 |
| 209 struct OfficeExtensionType { | 209 struct OfficeExtensionType { |
| 210 OfficeDocType doc_type; | 210 OfficeDocType doc_type; |
| 211 const char* extension; | 211 const char* extension; |
| 212 size_t extension_len; | 212 size_t extension_len; |
| 213 }; | 213 }; |
| 214 | 214 |
| 215 #define OFFICE_EXTENSION(type, extension) \ | 215 #define OFFICE_EXTENSION(type, extension) \ |
| 216 { (type), (extension), sizeof(extension) - 1 }, | 216 { (type), (extension), sizeof(extension) - 1 } \ |
| 217 , | |
| 217 | 218 |
| 218 static const OfficeExtensionType kOfficeExtensionTypes[] = { | 219 static const OfficeExtensionType kOfficeExtensionTypes[] = { |
| 219 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") | 220 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") |
| 220 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") | 221 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") |
| 221 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") | 222 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") |
| 222 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") | 223 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") |
| 223 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") | 224 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") |
| 224 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx") | 225 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx")}; |
| 225 }; | |
| 226 | 226 |
| 227 static const MagicNumber kExtraMagicNumbers[] = { | 227 static const MagicNumber kExtraMagicNumbers[] = { |
| 228 MAGIC_NUMBER("image/x-xbitmap", "#define") | 228 MAGIC_NUMBER("image/x-xbitmap", "#define") |
| 229 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00") | 229 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00") |
| 230 MAGIC_NUMBER("image/svg+xml", "<?xml_version=") | 230 MAGIC_NUMBER("image/svg+xml", "<?xml_version=") |
| 231 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ") | 231 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ") |
| 232 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST") | 232 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST") |
| 233 MAGIC_NUMBER("audio/ogg", "OggS") | 233 MAGIC_NUMBER("audio/ogg", "OggS") |
| 234 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0") | 234 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0") |
| 235 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0") | 235 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0") |
| 236 MAGIC_NUMBER("video/3gpp", "....ftyp3g") | 236 MAGIC_NUMBER("video/3gpp", "....ftyp3g") |
| 237 MAGIC_NUMBER("video/3gpp", "....ftypavcl") | 237 MAGIC_NUMBER("video/3gpp", "....ftypavcl") |
| 238 MAGIC_NUMBER("video/mp4", "....ftyp") | 238 MAGIC_NUMBER("video/mp4", "....ftyp") |
| 239 MAGIC_NUMBER("video/quicktime", "....moov") | 239 MAGIC_NUMBER("video/quicktime", "....moov") |
| 240 MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | 240 MAGIC_NUMBER("application/x-shockwave-flash", "CWS") |
| 241 MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | 241 MAGIC_NUMBER("application/x-shockwave-flash", "FWS") |
| 242 MAGIC_NUMBER("video/x-flv", "FLV") | 242 MAGIC_NUMBER("video/x-flv", "FLV") MAGIC_NUMBER("audio/x-flac", "fLaC") |
| 243 MAGIC_NUMBER("audio/x-flac", "fLaC") | |
| 244 | 243 |
| 245 // RAW image types. | 244 // RAW image types. |
| 246 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR") | 245 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR") |
| 247 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR") | 246 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR") |
| 248 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM") | 247 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM") |
| 249 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian | 248 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian |
| 250 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian | 249 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian |
| 251 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian | 250 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian |
| 252 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ") | 251 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ") |
| 253 MAGIC_NUMBER("image/x-panasonic-raw", | 252 MAGIC_NUMBER("image/x-panasonic-raw", |
| 254 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw | 253 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw |
| 255 MAGIC_NUMBER("image/x-panasonic-raw", | 254 MAGIC_NUMBER("image/x-panasonic-raw", |
| 256 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2 | 255 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2 |
| 257 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw") | 256 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw") |
| 258 MAGIC_NUMBER("image/x-x3f", "FOVb") | 257 MAGIC_NUMBER("image/x-x3f", "FOVb")}; |
| 259 }; | |
| 260 | 258 |
| 261 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will | 259 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will |
| 262 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is | 260 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is |
| 263 // HTML, but we will not. | 261 // HTML, but we will not. |
| 264 | 262 |
| 265 #define MAGIC_HTML_TAG(tag) \ | 263 #define MAGIC_HTML_TAG(tag) MAGIC_STRING("text/html", "<" tag) |
| 266 MAGIC_STRING("text/html", "<" tag) | |
| 267 | 264 |
| 268 static const MagicNumber kSniffableTags[] = { | 265 static const MagicNumber kSniffableTags[] = { |
| 269 // XML processing directive. Although this is not an HTML mime type, we sniff | 266 // XML processing directive. Although this is not an HTML mime type, we |
| 270 // for this in the HTML phase because text/xml is just as powerful as HTML and | 267 // sniff |
| 271 // we want to leverage our white space skipping technology. | 268 // for this in the HTML phase because text/xml is just as powerful as HTML |
| 272 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla | 269 // and |
| 273 // DOCTYPEs | 270 // we want to leverage our white space skipping technology. |
| 274 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec | 271 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla |
| 275 // Sniffable tags, ordered by how often they occur in sniffable documents. | 272 // DOCTYPEs |
| 276 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla | 273 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec |
| 277 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla | 274 // Sniffable tags, ordered by how often they occur in sniffable documents. |
| 278 MAGIC_HTML_TAG("!--") | 275 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla |
| 279 MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla | 276 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla |
| 280 MAGIC_HTML_TAG("iframe") // Mozilla | 277 MAGIC_HTML_TAG("!--") MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla |
| 281 MAGIC_HTML_TAG("h1") // Mozilla | 278 MAGIC_HTML_TAG("iframe") // Mozilla |
| 282 MAGIC_HTML_TAG("div") // Mozilla | 279 MAGIC_HTML_TAG("h1") // Mozilla |
| 283 MAGIC_HTML_TAG("font") // Mozilla | 280 MAGIC_HTML_TAG("div") // Mozilla |
| 284 MAGIC_HTML_TAG("table") // Mozilla | 281 MAGIC_HTML_TAG("font") // Mozilla |
| 285 MAGIC_HTML_TAG("a") // Mozilla | 282 MAGIC_HTML_TAG("table") // Mozilla |
| 286 MAGIC_HTML_TAG("style") // Mozilla | 283 MAGIC_HTML_TAG("a") // Mozilla |
| 287 MAGIC_HTML_TAG("title") // Mozilla | 284 MAGIC_HTML_TAG("style") // Mozilla |
| 288 MAGIC_HTML_TAG("b") // Mozilla | 285 MAGIC_HTML_TAG("title") // Mozilla |
| 289 MAGIC_HTML_TAG("body") // Mozilla | 286 MAGIC_HTML_TAG("b") // Mozilla |
| 290 MAGIC_HTML_TAG("br") | 287 MAGIC_HTML_TAG("body") // Mozilla |
| 291 MAGIC_HTML_TAG("p") // Mozilla | 288 MAGIC_HTML_TAG("br") MAGIC_HTML_TAG("p") // Mozilla |
| 292 }; | 289 }; |
| 293 | 290 |
| 294 static base::HistogramBase* UMASnifferHistogramGet(const char* name, | 291 static base::HistogramBase* UMASnifferHistogramGet(const char* name, |
| 295 int array_size) { | 292 int array_size) { |
| 296 base::HistogramBase* counter = | 293 base::HistogramBase* counter = base::LinearHistogram::FactoryGet( |
| 297 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, | 294 name, |
| 298 base::HistogramBase::kUmaTargetedHistogramFlag); | 295 1, |
| 296 array_size - 1, | |
| 297 array_size, | |
| 298 base::HistogramBase::kUmaTargetedHistogramFlag); | |
| 299 return counter; | 299 return counter; |
| 300 } | 300 } |
| 301 | 301 |
| 302 // Compare content header to a magic number where magic_entry can contain '.' | 302 // Compare content header to a magic number where magic_entry can contain '.' |
| 303 // for single character of anything, allowing some bytes to be skipped. | 303 // for single character of anything, allowing some bytes to be skipped. |
| 304 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { | 304 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { |
| 305 while (len) { | 305 while (len) { |
| 306 if ((*magic_entry != '.') && (*magic_entry != *content)) | 306 if ((*magic_entry != '.') && (*magic_entry != *content)) |
| 307 return false; | 307 return false; |
| 308 ++magic_entry; | 308 ++magic_entry; |
| (...skipping 52 matching lines...) | |
| 361 } | 361 } |
| 362 } | 362 } |
| 363 | 363 |
| 364 if (match) { | 364 if (match) { |
| 365 result->assign(magic_entry.mime_type); | 365 result->assign(magic_entry.mime_type); |
| 366 return true; | 366 return true; |
| 367 } | 367 } |
| 368 return false; | 368 return false; |
| 369 } | 369 } |
| 370 | 370 |
| 371 static bool CheckForMagicNumbers(const char* content, size_t size, | 371 static bool CheckForMagicNumbers(const char* content, |
| 372 const MagicNumber* magic, size_t magic_len, | 372 size_t size, |
| 373 const MagicNumber* magic, | |
| 374 size_t magic_len, | |
| 373 base::HistogramBase* counter, | 375 base::HistogramBase* counter, |
| 374 std::string* result) { | 376 std::string* result) { |
| 375 for (size_t i = 0; i < magic_len; ++i) { | 377 for (size_t i = 0; i < magic_len; ++i) { |
| 376 if (MatchMagicNumber(content, size, magic[i], result)) { | 378 if (MatchMagicNumber(content, size, magic[i], result)) { |
| 377 if (counter) counter->Add(static_cast<int>(i)); | 379 if (counter) |
| 380 counter->Add(static_cast<int>(i)); | |
| 378 return true; | 381 return true; |
| 379 } | 382 } |
| 380 } | 383 } |
| 381 return false; | 384 return false; |
| 382 } | 385 } |
| 383 | 386 |
| 384 // Truncates |size| to |max_size| and returns true if |size| is at least | 387 // Truncates |size| to |max_size| and returns true if |size| is at least |
| 385 // |max_size|. | 388 // |max_size|. |
| 386 static bool TruncateSize(const size_t max_size, size_t* size) { | 389 static bool TruncateSize(const size_t max_size, size_t* size) { |
| 387 // Keep kMaxBytesToSniff honest. | 390 // Keep kMaxBytesToSniff honest. |
| (...skipping 23 matching lines...) | |
| 411 for (pos = content; pos < end; ++pos) { | 414 for (pos = content; pos < end; ++pos) { |
| 412 if (!IsAsciiWhitespace(*pos)) | 415 if (!IsAsciiWhitespace(*pos)) |
| 413 break; | 416 break; |
| 414 } | 417 } |
| 415 static base::HistogramBase* counter(NULL); | 418 static base::HistogramBase* counter(NULL); |
| 416 if (!counter) { | 419 if (!counter) { |
| 417 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", | 420 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", |
| 418 arraysize(kSniffableTags)); | 421 arraysize(kSniffableTags)); |
| 419 } | 422 } |
| 420 // |pos| now points to first non-whitespace character (or at end). | 423 // |pos| now points to first non-whitespace character (or at end). |
| 421 return CheckForMagicNumbers(pos, end - pos, | 424 return CheckForMagicNumbers(pos, |
| 422 kSniffableTags, arraysize(kSniffableTags), | 425 end - pos, |
| 423 counter, result); | 426 kSniffableTags, |
| 427 arraysize(kSniffableTags), | |
| 428 counter, | |
| 429 result); | |
| 424 } | 430 } |
| 425 | 431 |
| 426 // Returns true and sets result if the content matches any of kMagicNumbers. | 432 // Returns true and sets result if the content matches any of kMagicNumbers. |
| 427 // Clears have_enough_content if more data could possibly change the result. | 433 // Clears have_enough_content if more data could possibly change the result. |
| 428 static bool SniffForMagicNumbers(const char* content, | 434 static bool SniffForMagicNumbers(const char* content, |
| 429 size_t size, | 435 size_t size, |
| 430 bool* have_enough_content, | 436 bool* have_enough_content, |
| 431 std::string* result) { | 437 std::string* result) { |
| 432 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | 438 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); |
| 433 | 439 |
| 434 // Check our big table of Magic Numbers | 440 // Check our big table of Magic Numbers |
| 435 static base::HistogramBase* counter(NULL); | 441 static base::HistogramBase* counter(NULL); |
| 436 if (!counter) { | 442 if (!counter) { |
| 437 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", | 443 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", |
| 438 arraysize(kMagicNumbers)); | 444 arraysize(kMagicNumbers)); |
| 439 } | 445 } |
| 440 return CheckForMagicNumbers(content, size, | 446 return CheckForMagicNumbers( |
| 441 kMagicNumbers, arraysize(kMagicNumbers), | 447 content, size, kMagicNumbers, arraysize(kMagicNumbers), counter, result); |
| 442 counter, result); | |
| 443 } | 448 } |
| 444 | 449 |
| 445 // Returns true and sets result if the content matches any of | 450 // Returns true and sets result if the content matches any of |
| 446 // kOfficeMagicNumbers, and the URL has the proper extension. | 451 // kOfficeMagicNumbers, and the URL has the proper extension. |
| 447 // Clears |have_enough_content| if more data could possibly change the result. | 452 // Clears |have_enough_content| if more data could possibly change the result. |
| 448 static bool SniffForOfficeDocs(const char* content, | 453 static bool SniffForOfficeDocs(const char* content, |
| 449 size_t size, | 454 size_t size, |
| 450 const GURL& url, | 455 const GURL& url, |
| 451 bool* have_enough_content, | 456 bool* have_enough_content, |
| 452 std::string* result) { | 457 std::string* result) { |
| 453 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); | 458 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); |
| 454 | 459 |
| 455 // Check our table of magic numbers for Office file types. | 460 // Check our table of magic numbers for Office file types. |
| 456 std::string office_version; | 461 std::string office_version; |
| 457 if (!CheckForMagicNumbers(content, size, | 462 if (!CheckForMagicNumbers(content, |
| 458 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | 463 size, |
| 459 NULL, &office_version)) | 464 kOfficeMagicNumbers, |
| 465 arraysize(kOfficeMagicNumbers), | |
| 466 NULL, | |
| 467 &office_version)) | |
| 460 return false; | 468 return false; |
| 461 | 469 |
| 462 OfficeDocType type = DOC_TYPE_NONE; | 470 OfficeDocType type = DOC_TYPE_NONE; |
| 463 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { | 471 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { |
| 464 std::string url_path = url.path(); | 472 std::string url_path = url.path(); |
| 465 | 473 |
| 466 if (url_path.length() < kOfficeExtensionTypes[i].extension_len) | 474 if (url_path.length() < kOfficeExtensionTypes[i].extension_len) |
| 467 continue; | 475 continue; |
| 468 | 476 |
| 469 const char* extension = | 477 const char* extension = |
| 470 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len]; | 478 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len]; |
| 471 | 479 |
| 472 if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension, | 480 if (0 == base::strncasecmp(extension, |
| 481 kOfficeExtensionTypes[i].extension, | |
| 473 kOfficeExtensionTypes[i].extension_len)) { | 482 kOfficeExtensionTypes[i].extension_len)) { |
| 474 type = kOfficeExtensionTypes[i].doc_type; | 483 type = kOfficeExtensionTypes[i].doc_type; |
| 475 break; | 484 break; |
| 476 } | 485 } |
| 477 } | 486 } |
| 478 | 487 |
| 479 if (type == DOC_TYPE_NONE) | 488 if (type == DOC_TYPE_NONE) |
| 480 return false; | 489 return false; |
| 481 | 490 |
| 482 if (office_version == "CFB") { | 491 if (office_version == "CFB") { |
| 483 switch (type) { | 492 switch (type) { |
| 484 case DOC_TYPE_WORD: | 493 case DOC_TYPE_WORD: |
| 485 *result = "application/msword"; | 494 *result = "application/msword"; |
| 486 return true; | 495 return true; |
| 487 case DOC_TYPE_EXCEL: | 496 case DOC_TYPE_EXCEL: |
| 488 *result = "application/vnd.ms-excel"; | 497 *result = "application/vnd.ms-excel"; |
| 489 return true; | 498 return true; |
| 490 case DOC_TYPE_POWERPOINT: | 499 case DOC_TYPE_POWERPOINT: |
| 491 *result = "application/vnd.ms-powerpoint"; | 500 *result = "application/vnd.ms-powerpoint"; |
| 492 return true; | 501 return true; |
| 493 case DOC_TYPE_NONE: | 502 case DOC_TYPE_NONE: |
| 494 NOTREACHED(); | 503 NOTREACHED(); |
| 495 return false; | 504 return false; |
| 496 } | 505 } |
| 497 } else if (office_version == "OOXML") { | 506 } else if (office_version == "OOXML") { |
| 498 switch (type) { | 507 switch (type) { |
| 499 case DOC_TYPE_WORD: | 508 case DOC_TYPE_WORD: |
| 500 *result = "application/vnd.openxmlformats-officedocument." | 509 *result = |
| 501 "wordprocessingml.document"; | 510 "application/vnd.openxmlformats-officedocument." |
| 511 "wordprocessingml.document"; | |
| 502 return true; | 512 return true; |
| 503 case DOC_TYPE_EXCEL: | 513 case DOC_TYPE_EXCEL: |
| 504 *result = "application/vnd.openxmlformats-officedocument." | 514 *result = |
| 505 "spreadsheetml.sheet"; | 515 "application/vnd.openxmlformats-officedocument." |
| 516 "spreadsheetml.sheet"; | |
| 506 return true; | 517 return true; |
| 507 case DOC_TYPE_POWERPOINT: | 518 case DOC_TYPE_POWERPOINT: |
| 508 *result = "application/vnd.openxmlformats-officedocument." | 519 *result = |
| 509 "presentationml.presentation"; | 520 "application/vnd.openxmlformats-officedocument." |
| 521 "presentationml.presentation"; | |
| 510 return true; | 522 return true; |
| 511 case DOC_TYPE_NONE: | 523 case DOC_TYPE_NONE: |
| 512 NOTREACHED(); | 524 NOTREACHED(); |
| 513 return false; | 525 return false; |
| 514 } | 526 } |
| 515 } | 527 } |
| 516 | 528 |
| 517 NOTREACHED(); | 529 NOTREACHED(); |
| 518 return false; | 530 return false; |
| 519 } | 531 } |
| 520 | 532 |
| 521 static bool IsOfficeType(const std::string& type_hint) { | 533 static bool IsOfficeType(const std::string& type_hint) { |
| 522 return (type_hint == "application/msword" || | 534 return (type_hint == "application/msword" || |
| 523 type_hint == "application/vnd.ms-excel" || | 535 type_hint == "application/vnd.ms-excel" || |
| 524 type_hint == "application/vnd.ms-powerpoint" || | 536 type_hint == "application/vnd.ms-powerpoint" || |
| 525 type_hint == "application/vnd.openxmlformats-officedocument." | 537 type_hint == |
| 526 "wordprocessingml.document" || | 538 "application/vnd.openxmlformats-officedocument." |
| 527 type_hint == "application/vnd.openxmlformats-officedocument." | 539 "wordprocessingml.document" || |
| 528 "spreadsheetml.sheet" || | 540 type_hint == |
| 529 type_hint == "application/vnd.openxmlformats-officedocument." | 541 "application/vnd.openxmlformats-officedocument." |
| 530 "presentationml.presentation" || | 542 "spreadsheetml.sheet" || |
| 543 type_hint == | |
| 544 "application/vnd.openxmlformats-officedocument." | |
| 545 "presentationml.presentation" || | |
| 531 type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" || | 546 type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" || |
| 532 type_hint == "application/vnd.ms-word.document.macroenabled.12" || | 547 type_hint == "application/vnd.ms-word.document.macroenabled.12" || |
| 533 type_hint == "application/vnd.ms-powerpoint.presentation." | 548 type_hint == |
| 534 "macroenabled.12" || | 549 "application/vnd.ms-powerpoint.presentation." |
| 550 "macroenabled.12" || | |
| 535 type_hint == "application/mspowerpoint" || | 551 type_hint == "application/mspowerpoint" || |
| 536 type_hint == "application/msexcel" || | 552 type_hint == "application/msexcel" || |
| 537 type_hint == "application/vnd.ms-word" || | 553 type_hint == "application/vnd.ms-word" || |
| 538 type_hint == "application/vnd.ms-word.document.12" || | 554 type_hint == "application/vnd.ms-word.document.12" || |
| 539 type_hint == "application/vnd.msword"); | 555 type_hint == "application/vnd.msword"); |
| 540 } | 556 } |
| 541 | 557 |
| 542 // This function checks for files that have a Microsoft Office MIME type | 558 // This function checks for files that have a Microsoft Office MIME type |
| 543 // set, but are not actually Office files. | 559 // set, but are not actually Office files. |
| 544 // | 560 // |
| 545 // If this is not actually an Office file, |*result| is set to | 561 // If this is not actually an Office file, |*result| is set to |
| 546 // "application/octet-stream", otherwise it is not modified. | 562 // "application/octet-stream", otherwise it is not modified. |
| 547 // | 563 // |
| 548 // Returns false if additional data is required to determine the file type, or | 564 // Returns false if additional data is required to determine the file type, or |
| 549 // true if there is enough data to make a decision. | 565 // true if there is enough data to make a decision. |
| 550 static bool SniffForInvalidOfficeDocs(const char* content, | 566 static bool SniffForInvalidOfficeDocs(const char* content, |
| 551 size_t size, | 567 size_t size, |
| 552 const GURL& url, | 568 const GURL& url, |
| 553 std::string* result) { | 569 std::string* result) { |
| 554 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size)) | 570 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size)) |
| 555 return false; | 571 return false; |
| 556 | 572 |
| 557 // Check our table of magic numbers for Office file types. If it does not | 573 // Check our table of magic numbers for Office file types. If it does not |
| 558 // match one, the MIME type was invalid. Set it instead to a safe value. | 574 // match one, the MIME type was invalid. Set it instead to a safe value. |
| 559 std::string office_version; | 575 std::string office_version; |
| 560 if (!CheckForMagicNumbers(content, size, | 576 if (!CheckForMagicNumbers(content, |
| 561 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | 577 size, |
| 562 NULL, &office_version)) { | 578 kOfficeMagicNumbers, |
| 579 arraysize(kOfficeMagicNumbers), | |
| 580 NULL, | |
| 581 &office_version)) { | |
| 563 *result = "application/octet-stream"; | 582 *result = "application/octet-stream"; |
| 564 } | 583 } |
| 565 | 584 |
| 566 // We have enough information to determine if this was a Microsoft Office | 585 // We have enough information to determine if this was a Microsoft Office |
| 567 // document or not, so sniffing is completed. | 586 // document or not, so sniffing is completed. |
| 568 return true; | 587 return true; |
| 569 } | 588 } |
| 570 | 589 |
| 571 // Magic strings for XML-based types (XHTML, Atom and RSS feeds) | 590 // Magic strings for XML-based types (XHTML, Atom and RSS feeds) |
| 572 static const MagicNumber kMagicXML[] = { | 591 static const MagicNumber kMagicXML[] = { |
| 573 // We want to be very conservative in interpreting text/xml content as | 592 // We want to be very conservative in interpreting text/xml content as |
| 574 // XHTML -- we just want to sniff enough to make unit tests pass. | 593 // XHTML -- we just want to sniff enough to make unit tests pass. |
| 575 // So we match explicitly on this, and don't match other ways of writing | 594 // So we match explicitly on this, and don't match other ways of writing |
| 576 // it in semantically-equivalent ways. | 595 // it in semantically-equivalent ways. |
| 577 MAGIC_STRING("application/xhtml+xml", | 596 MAGIC_STRING("application/xhtml+xml", |
| 578 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") | 597 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") |
| 579 MAGIC_STRING("application/atom+xml", "<feed") | 598 MAGIC_STRING("application/atom+xml", "<feed") |
| 580 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 | 599 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 |
| 581 }; | 600 }; |
| 582 | 601 |
| 583 // Returns true and sets result if the content appears to contain XHTML or a | 602 // Returns true and sets result if the content appears to contain XHTML or a |
| 584 // feed. | 603 // feed. |
| 585 // Clears have_enough_content if more data could possibly change the result. | 604 // Clears have_enough_content if more data could possibly change the result. |
| 586 // | 605 // |
| 587 // TODO(evanm): this is similar but more conservative than what Safari does, | 606 // TODO(evanm): this is similar but more conservative than what Safari does, |
| 588 // while HTML5 has a different recommendation -- what should we do? | 607 // while HTML5 has a different recommendation -- what should we do? |
| 589 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset | 608 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset |
| 590 // of ASCII -- do we care? | 609 // of ASCII -- do we care? |
| 591 static bool SniffXML(const char* content, | 610 static bool SniffXML(const char* content, |
| 592 size_t size, | 611 size_t size, |
| 593 bool* have_enough_content, | 612 bool* have_enough_content, |
| 594 std::string* result) { | 613 std::string* result) { |
| 595 // We allow at most 300 bytes of content before we expect the opening tag. | 614 // We allow at most 300 bytes of content before we expect the opening tag. |
| 596 *have_enough_content &= TruncateSize(300, &size); | 615 *have_enough_content &= TruncateSize(300, &size); |
| 597 const char* pos = content; | 616 const char* pos = content; |
| 598 const char* const end = content + size; | 617 const char* const end = content + size; |
| 599 | 618 |
| 600 // This loop iterates through tag-looking offsets in the file. | 619 // This loop iterates through tag-looking offsets in the file. |
| 601 // We want to skip XML processing instructions (of the form "<?xml ...") | 620 // We want to skip XML processing instructions (of the form "<?xml ...") |
| 602 // and stop at the first "plain" tag, then make a decision on the mime-type | 621 // and stop at the first "plain" tag, then make a decision on the mime-type |
| 603 // based on the name (or possibly attributes) of that tag. | 622 // based on the name (or possibly attributes) of that tag. |
| 604 static base::HistogramBase* counter(NULL); | 623 static base::HistogramBase* counter(NULL); |
| 605 if (!counter) { | 624 if (!counter) { |
| 606 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", | 625 counter = |
| 607 arraysize(kMagicXML)); | 626 UMASnifferHistogramGet("mime_sniffer.kMagicXML2", arraysize(kMagicXML)); |
| 608 } | 627 } |
| 609 const int kMaxTagIterations = 5; | 628 const int kMaxTagIterations = 5; |
| 610 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { | 629 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { |
| 611 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); | 630 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); |
| 612 if (!pos) | 631 if (!pos) |
| 613 return false; | 632 return false; |
| 614 | 633 |
| 615 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0) { | 634 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0) { |
| 616 // Skip XML declarations. | 635 // Skip XML declarations. |
| 617 ++pos; | 636 ++pos; |
| 618 continue; | 637 continue; |
| 619 } else if (base::strncasecmp(pos, "<!DOCTYPE", | 638 } else if (base::strncasecmp(pos, "<!DOCTYPE", sizeof("<!DOCTYPE") - 1) == |
| 620 sizeof("<!DOCTYPE") - 1) == 0) { | 639 0) { |
|
mmenke
2014/10/10 18:12:39
Think this is pretty ugly - I find no extra indent
| |
| 621 // Skip DOCTYPE declarations. | 640 // Skip DOCTYPE declarations. |
| 622 ++pos; | 641 ++pos; |
| 623 continue; | 642 continue; |
| 624 } | 643 } |
| 625 | 644 |
| 626 if (CheckForMagicNumbers(pos, end - pos, | 645 if (CheckForMagicNumbers( |
| 627 kMagicXML, arraysize(kMagicXML), | 646 pos, end - pos, kMagicXML, arraysize(kMagicXML), counter, result)) |
| 628 counter, result)) | |
| 629 return true; | 647 return true; |
|
mmenke
2014/10/10 18:12:39
This is a style violation - when an if body takes
| |
| 630 | 648 |
| 631 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult | 649 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult |
| 632 // to identify. | 650 // to identify. |
| 633 | 651 |
| 634 // If we get here, we've hit an initial tag that hasn't matched one of the | 652 // If we get here, we've hit an initial tag that hasn't matched one of the |
| 635 // above tests. Abort. | 653 // above tests. Abort. |
| 636 return true; | 654 return true; |
| 637 } | 655 } |
| 638 | 656 |
| 639 // We iterated too far without finding a start tag. | 657 // We iterated too far without finding a start tag. |
| 640 // If we have more content to look at, we aren't going to change our mind by | 658 // If we have more content to look at, we aren't going to change our mind by |
| 641 // seeing more bytes from the network. | 659 // seeing more bytes from the network. |
| 642 return pos < end; | 660 return pos < end; |
| 643 } | 661 } |
| 644 | 662 |
| 645 // Byte order marks | 663 // Byte order marks |
| 646 static const MagicNumber kByteOrderMark[] = { | 664 static const MagicNumber kByteOrderMark[] = { |
| 647 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE | 665 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE |
| 648 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE | 666 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE |
| 649 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 | 667 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 |
| 650 }; | 668 }; |
| 651 | 669 |
| 652 // Whether a given byte looks like it might be part of binary content. | 670 // Whether a given byte looks like it might be part of binary content. |
| 653 // Source: HTML5 spec | 671 // Source: HTML5 spec |
| 654 static char kByteLooksBinary[] = { | 672 static char kByteLooksBinary[] = { |
| 655 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F | 673 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F |
| 656 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F | 674 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F |
| 657 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F | 675 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F |
| 658 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F | 676 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F |
| 659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F | 677 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F |
| 660 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F | 678 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F |
| 661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F | 679 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F |
| 662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F | 680 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F |
| 663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F | 681 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F |
| 664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F | 682 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F |
| 665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF | 683 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF |
| 666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF | 684 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF |
| 667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF | 685 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF |
| 668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF | 686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF |
| 669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF | 687 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF |
| 670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF | 688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF |
| 671 }; | 689 }; |
| 672 | 690 |
| 673 // Returns true and sets result to "application/octet-stream" if the content | 691 // Returns true and sets result to "application/octet-stream" if the content |
| 674 // appears to be binary data. Otherwise, returns false and sets "text/plain". | 692 // appears to be binary data. Otherwise, returns false and sets "text/plain". |
| 675 // Clears have_enough_content if more data could possibly change the result. | 693 // Clears have_enough_content if more data could possibly change the result. |
| 676 static bool SniffBinary(const char* content, | 694 static bool SniffBinary(const char* content, |
| 677 size_t size, | 695 size_t size, |
| 678 bool* have_enough_content, | 696 bool* have_enough_content, |
| 679 std::string* result) { | 697 std::string* result) { |
| 680 // There is no consensus about exactly how to sniff for binary content. | 698 // There is no consensus about exactly how to sniff for binary content. |
| 681 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. | 699 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. |
| 682 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. | 700 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. |
| 683 // Here, we side with FF, but with a smaller buffer. This size was chosen | 701 // Here, we side with FF, but with a smaller buffer. This size was chosen |
| 684 // because it is small enough to comfortably fit into a single packet (after | 702 // because it is small enough to comfortably fit into a single packet (after |
| 685 // allowing for headers) and yet large enough to account for binary formats | 703 // allowing for headers) and yet large enough to account for binary formats |
| 686 // that have a significant amount of ASCII at the beginning (crbug.com/15314). | 704 // that have a significant amount of ASCII at the beginning (crbug.com/15314). |
| 687 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); | 705 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); |
| 688 | 706 |
| 689 // First, we look for a BOM. | 707 // First, we look for a BOM. |
| 690 static base::HistogramBase* counter(NULL); | 708 static base::HistogramBase* counter(NULL); |
| 691 if (!counter) { | 709 if (!counter) { |
| 692 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", | 710 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", |
| 693 arraysize(kByteOrderMark)); | 711 arraysize(kByteOrderMark)); |
| 694 } | 712 } |
| 695 std::string unused; | 713 std::string unused; |
| 696 if (CheckForMagicNumbers(content, size, | 714 if (CheckForMagicNumbers(content, |
| 697 kByteOrderMark, arraysize(kByteOrderMark), | 715 size, |
| 698 counter, &unused)) { | 716 kByteOrderMark, |
| 717 arraysize(kByteOrderMark), | |
| 718 counter, | |
| 719 &unused)) { | |
| 699 // If there is BOM, we think the buffer is not binary. | 720 // If there is BOM, we think the buffer is not binary. |
| 700 result->assign("text/plain"); | 721 result->assign("text/plain"); |
| 701 return false; | 722 return false; |
| 702 } | 723 } |
| 703 | 724 |
| 704 // Next we look to see if any of the bytes "look binary." | 725 // Next we look to see if any of the bytes "look binary." |
| 705 for (size_t i = 0; i < size; ++i) { | 726 for (size_t i = 0; i < size; ++i) { |
| 706 // If we see a binary-looking byte, we think the content is binary. | 727 // If we see a binary-looking byte, we think the content is binary. |
| 707 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { | 728 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { |
| 708 result->assign("application/octet-stream"); | 729 result->assign("application/octet-stream"); |
| 709 return true; | 730 return true; |
| 710 } | 731 } |
| 711 } | 732 } |
| 712 | 733 |
| 713 // No evidence either way. Default to non-binary and, if truncated, clear | 734 // No evidence either way. Default to non-binary and, if truncated, clear |
| 714 // have_enough_content because there could be a binary looking byte in the | 735 // have_enough_content because there could be a binary looking byte in the |
| 715 // truncated data. | 736 // truncated data. |
| 716 *have_enough_content &= is_truncated; | 737 *have_enough_content &= is_truncated; |
| 717 result->assign("text/plain"); | 738 result->assign("text/plain"); |
| 718 return false; | 739 return false; |
| 719 } | 740 } |
| 720 | 741 |
| 721 static bool IsUnknownMimeType(const std::string& mime_type) { | 742 static bool IsUnknownMimeType(const std::string& mime_type) { |
| 722 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. | 743 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. |
| 723 // If we do, please be careful not to alter the semantics at all. | 744 // If we do, please be careful not to alter the semantics at all. |
| 724 static const char* kUnknownMimeTypes[] = { | 745 static const char* kUnknownMimeTypes[] = { |
| 725 // Empty mime types are as unknown as they get. | 746 // Empty mime types are as unknown as they get. |
| 726 "", | 747 "", |
| 727 // The unknown/unknown type is popular and uninformative | 748 // The unknown/unknown type is popular and uninformative |
| 728 "unknown/unknown", | 749 "unknown/unknown", |
| 729 // The second most popular unknown mime type is application/unknown | 750 // The second most popular unknown mime type is application/unknown |
| 730 "application/unknown", | 751 "application/unknown", |
| 731 // Firefox rejects a mime type if it is exactly */* | 752 // Firefox rejects a mime type if it is exactly */* |
| 732 "*/*", | 753 "*/*", |
| 733 }; | 754 }; |
| 734 static base::HistogramBase* counter(NULL); | 755 static base::HistogramBase* counter(NULL); |
| 735 if (!counter) { | 756 if (!counter) { |
| 736 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", | 757 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", |
| 737 arraysize(kUnknownMimeTypes) + 1); | 758 arraysize(kUnknownMimeTypes) + 1); |
| 738 } | 759 } |
| 739 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { | 760 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { |
| 740 if (mime_type == kUnknownMimeTypes[i]) { | 761 if (mime_type == kUnknownMimeTypes[i]) { |
| 741 counter->Add(i); | 762 counter->Add(i); |
| 742 return true; | 763 return true; |
| (...skipping 21 matching lines...) Expand all Loading... | |
| 764 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); | 785 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); |
| 765 | 786 |
| 766 // Technically, the crx magic number is just Cr24, but the bytes after that | 787 // Technically, the crx magic number is just Cr24, but the bytes after that |
| 767 // are a version number which changes infrequently. Including it in the | 788 // are a version number which changes infrequently. Including it in the |
| 768 // sniffing gives us less room for error. If the version number ever changes, | 789 // sniffing gives us less room for error. If the version number ever changes, |
| 769 // we can just add an entry to this list. | 790 // we can just add an entry to this list. |
| 770 // | 791 // |
| 771 // TODO(aa): If we ever have another magic number, we'll want to pass a | 792 // TODO(aa): If we ever have another magic number, we'll want to pass a |
| 772 // histogram into CheckForMagicNumbers(), below, to see which one matched. | 793 // histogram into CheckForMagicNumbers(), below, to see which one matched. |
| 773 static const struct MagicNumber kCRXMagicNumbers[] = { | 794 static const struct MagicNumber kCRXMagicNumbers[] = { |
| 774 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") | 795 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")}; |
| 775 }; | |
| 776 | 796 |
| 777 // Only consider files that have the extension ".crx". | 797 // Only consider files that have the extension ".crx". |
| 778 static const char kCRXExtension[] = ".crx"; | 798 static const char kCRXExtension[] = ".crx"; |
| 779 // Ignore null by subtracting 1. | 799 // Ignore null by subtracting 1. |
| 780 static const int kExtensionLength = arraysize(kCRXExtension) - 1; | 800 static const int kExtensionLength = arraysize(kCRXExtension) - 1; |
| 781 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == | 801 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == |
| 782 url.path().size() - kExtensionLength) { | 802 url.path().size() - kExtensionLength) { |
| 783 counter->Add(1); | 803 counter->Add(1); |
| 784 } else { | 804 } else { |
| 785 return false; | 805 return false; |
| 786 } | 806 } |
| 787 | 807 |
| 788 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | 808 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); |
| 789 if (CheckForMagicNumbers(content, size, | 809 if (CheckForMagicNumbers(content, |
| 790 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), | 810 size, |
| 791 NULL, result)) { | 811 kCRXMagicNumbers, |
| 812 arraysize(kCRXMagicNumbers), | |
| 813 NULL, | |
| 814 result)) { | |
| 792 counter->Add(2); | 815 counter->Add(2); |
| 793 } else { | 816 } else { |
| 794 return false; | 817 return false; |
| 795 } | 818 } |
| 796 | 819 |
| 797 return true; | 820 return true; |
| 798 } | 821 } |
| 799 | 822 |
| 800 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { | 823 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { |
| 801 static base::HistogramBase* should_sniff_counter(NULL); | 824 static base::HistogramBase* should_sniff_counter(NULL); |
| 802 if (!should_sniff_counter) { | 825 if (!should_sniff_counter) { |
| 803 should_sniff_counter = | 826 should_sniff_counter = |
| 804 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); | 827 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); |
| 805 } | 828 } |
| 806 bool sniffable_scheme = url.is_empty() || | 829 bool sniffable_scheme = url.is_empty() || url.SchemeIsHTTPOrHTTPS() || |
| 807 url.SchemeIsHTTPOrHTTPS() || | |
| 808 url.SchemeIs("ftp") || | 830 url.SchemeIs("ftp") || |
| 809 #if defined(OS_ANDROID) | 831 #if defined(OS_ANDROID) |
| 810 url.SchemeIs("content") || | 832 url.SchemeIs("content") || |
| 811 #endif | 833 #endif |
| 812 url.SchemeIsFile() || | 834 url.SchemeIsFile() || url.SchemeIsFileSystem(); |
| 813 url.SchemeIsFileSystem(); | |
| 814 if (!sniffable_scheme) { | 835 if (!sniffable_scheme) { |
| 815 should_sniff_counter->Add(1); | 836 should_sniff_counter->Add(1); |
| 816 return false; | 837 return false; |
| 817 } | 838 } |
| 818 | 839 |
| 819 static const char* kSniffableTypes[] = { | 840 static const char* |
| 820 // Many web servers are misconfigured to send text/plain for many | 841 kSniffableTypes |
| 821 // different types of content. | 842 [] = {// Many web servers are misconfigured to send text/plain for |
|
mmenke
2014/10/10 18:12:39
Just no.
| |
| 822 "text/plain", | 843 // many |
| 823 // We want to sniff application/octet-stream for | 844 // different types of content. |
| 824 // application/x-chrome-extension, but nothing else. | 845 "text/plain", |
| 825 "application/octet-stream", | 846 // We want to sniff application/octet-stream for |
| 826 // XHTML and Atom/RSS feeds are often served as plain xml instead of | 847 // application/x-chrome-extension, but nothing else. |
| 827 // their more specific mime types. | 848 "application/octet-stream", |
| 828 "text/xml", | 849 // XHTML and Atom/RSS feeds are often served as plain xml |
| 829 "application/xml", | 850 // instead of |
| 830 // Check for false Microsoft Office MIME types. | 851 // their more specific mime types. |
| 831 "application/msword", | 852 "text/xml", |
| 832 "application/vnd.ms-excel", | 853 "application/xml", |
| 833 "application/vnd.ms-powerpoint", | 854 // Check for false Microsoft Office MIME types. |
| 834 "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | 855 "application/msword", |
| 835 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | 856 "application/vnd.ms-excel", |
| 836 "application/vnd.openxmlformats-officedocument.presentationml.presentation", | 857 "application/vnd.ms-powerpoint", |
| 837 "application/vnd.ms-excel.sheet.macroenabled.12", | 858 "application/" |
| 838 "application/vnd.ms-word.document.macroenabled.12", | 859 "vnd.openxmlformats-officedocument.wordprocessingml.document", |
| 839 "application/vnd.ms-powerpoint.presentation.macroenabled.12", | 860 "application/" |
| 840 "application/mspowerpoint", | 861 "vnd.openxmlformats-officedocument.spreadsheetml.sheet", |
| 841 "application/msexcel", | 862 "application/" |
| 842 "application/vnd.ms-word", | 863 "vnd.openxmlformats-officedocument.presentationml.presentation", |
| 843 "application/vnd.ms-word.document.12", | 864 "application/vnd.ms-excel.sheet.macroenabled.12", |
| 844 "application/vnd.msword", | 865 "application/vnd.ms-word.document.macroenabled.12", |
| 845 }; | 866 "application/vnd.ms-powerpoint.presentation.macroenabled.12", |
| 867 "application/mspowerpoint", | |
| 868 "application/msexcel", | |
| 869 "application/vnd.ms-word", | |
| 870 "application/vnd.ms-word.document.12", | |
| 871 "application/vnd.msword", | |
| 872 }; | |
| 846 static base::HistogramBase* counter(NULL); | 873 static base::HistogramBase* counter(NULL); |
| 847 if (!counter) { | 874 if (!counter) { |
| 848 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", | 875 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", |
| 849 arraysize(kSniffableTypes) + 1); | 876 arraysize(kSniffableTypes) + 1); |
| 850 } | 877 } |
| 851 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { | 878 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { |
| 852 if (mime_type == kSniffableTypes[i]) { | 879 if (mime_type == kSniffableTypes[i]) { |
| 853 counter->Add(i); | 880 counter->Add(i); |
| 854 should_sniff_counter->Add(2); | 881 should_sniff_counter->Add(2); |
| 855 return true; | 882 return true; |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 923 // We're not interested in sniffing these types for images and the like. | 950 // We're not interested in sniffing these types for images and the like. |
| 924 // Instead, we're looking explicitly for a feed. If we don't find one | 951 // Instead, we're looking explicitly for a feed. If we don't find one |
| 925 // we're done and return early. | 952 // we're done and return early. |
| 926 if (SniffXML(content, content_size, &have_enough_content, result)) | 953 if (SniffXML(content, content_size, &have_enough_content, result)) |
| 927 return true; | 954 return true; |
| 928 return have_enough_content; | 955 return have_enough_content; |
| 929 } | 956 } |
| 930 | 957 |
| 931 // CRX files (Chrome extensions) have a special sniffing algorithm. It is | 958 // CRX files (Chrome extensions) have a special sniffing algorithm. It is |
| 932 // tighter than the others because we don't have to match legacy behavior. | 959 // tighter than the others because we don't have to match legacy behavior. |
| 933 if (SniffCRX(content, content_size, url, type_hint, | 960 if (SniffCRX( |
| 934 &have_enough_content, result)) | 961 content, content_size, url, type_hint, &have_enough_content, result)) |
| 935 return true; | 962 return true; |
| 936 | 963 |
| 937 // Check the file extension and magic numbers to see if this is an Office | 964 // Check the file extension and magic numbers to see if this is an Office |
| 938 // document. This needs to be checked before the general magic numbers | 965 // document. This needs to be checked before the general magic numbers |
| 939 // because zip files and Office documents (OOXML) have the same magic number. | 966 // because zip files and Office documents (OOXML) have the same magic number. |
| 940 if (SniffForOfficeDocs(content, content_size, url, | 967 if (SniffForOfficeDocs( |
| 941 &have_enough_content, result)) | 968 content, content_size, url, &have_enough_content, result)) |
| 942 return true; // We've matched a magic number. No more content needed. | 969 return true; // We've matched a magic number. No more content needed. |
| 943 | 970 |
| 944 // We're not interested in sniffing for magic numbers when the type_hint | 971 // We're not interested in sniffing for magic numbers when the type_hint |
| 945 // is application/octet-stream. Time to bail out. | 972 // is application/octet-stream. Time to bail out. |
| 946 if (type_hint == "application/octet-stream") | 973 if (type_hint == "application/octet-stream") |
| 947 return have_enough_content; | 974 return have_enough_content; |
| 948 | 975 |
| 949 // Now we look in our large table of magic numbers to see if we can find | 976 // Now we look in our large table of magic numbers to see if we can find |
| 950 // anything that matches the content. | 977 // anything that matches the content. |
| 951 if (SniffForMagicNumbers(content, content_size, | 978 if (SniffForMagicNumbers(content, content_size, &have_enough_content, result)) |
| 952 &have_enough_content, result)) | |
| 953 return true; // We've matched a magic number. No more content needed. | 979 return true; // We've matched a magic number. No more content needed. |
| 954 | 980 |
| 955 return have_enough_content; | 981 return have_enough_content; |
| 956 } | 982 } |
| 957 | 983 |
| 958 bool SniffMimeTypeFromLocalData(const char* content, | 984 bool SniffMimeTypeFromLocalData(const char* content, |
| 959 size_t size, | 985 size_t size, |
| 960 std::string* result) { | 986 std::string* result) { |
| 961 // First check the extra table. | 987 // First check the extra table. |
| 962 if (CheckForMagicNumbers(content, size, kExtraMagicNumbers, | 988 if (CheckForMagicNumbers(content, |
| 963 arraysize(kExtraMagicNumbers), NULL, result)) | 989 size, |
| 990 kExtraMagicNumbers, | |
| 991 arraysize(kExtraMagicNumbers), | |
| 992 NULL, | |
| 993 result)) | |
| 964 return true; | 994 return true; |
| 965 // Finally check the original table. | 995 // Finally check the original table. |
| 966 return CheckForMagicNumbers(content, size, kMagicNumbers, | 996 return CheckForMagicNumbers( |
| 967 arraysize(kMagicNumbers), NULL, result); | 997 content, size, kMagicNumbers, arraysize(kMagicNumbers), NULL, result); |
| 968 } | 998 } |
| 969 | 999 |
| 970 } // namespace net | 1000 } // namespace net |
| OLD | NEW |