Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(454)

Side by Side Diff: net/base/mime_sniffer.cc

Issue 266243004: Clang format slam. Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Detecting mime types is a tricky business because we need to balance 5 // Detecting mime types is a tricky business because we need to balance
6 // compatibility concerns with security issues. Here is a survey of how other 6 // compatibility concerns with security issues. Here is a survey of how other
7 // browsers behave and then a description of how we intend to behave. 7 // browsers behave and then a description of how we intend to behave.
8 // 8 //
9 // HTML payload, no Content-Type header: 9 // HTML payload, no Content-Type header:
10 // * IE 7: Render as HTML 10 // * IE 7: Render as HTML
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after
110 static const size_t kBytesRequiredForMagic = 42; 110 static const size_t kBytesRequiredForMagic = 42;
111 111
112 struct MagicNumber { 112 struct MagicNumber {
113 const char* mime_type; 113 const char* mime_type;
114 const char* magic; 114 const char* magic;
115 size_t magic_len; 115 size_t magic_len;
116 bool is_string; 116 bool is_string;
117 const char* mask; // if set, must have same length as |magic| 117 const char* mask; // if set, must have same length as |magic|
118 }; 118 };
119 119
120 #define MAGIC_NUMBER(mime_type, magic) \ 120 #define MAGIC_NUMBER(mime_type, magic) \
121 { (mime_type), (magic), sizeof(magic)-1, false, NULL }, 121 { (mime_type), (magic), sizeof(magic) - 1, false, NULL } \
122 ,
mmenke 2014/10/10 18:12:39 Hrm...That comma change is really weird.
122 123
123 template <int MagicSize, int MaskSize> 124 template <int MagicSize, int MaskSize>
124 class VerifySizes { 125 class VerifySizes {
125 COMPILE_ASSERT(MagicSize == MaskSize, sizes_must_be_equal); 126 COMPILE_ASSERT(MagicSize == MaskSize, sizes_must_be_equal);
127
126 public: 128 public:
127 enum { SIZES = MagicSize }; 129 enum { SIZES = MagicSize };
128 }; 130 };
129 131
130 #define verified_sizeof(magic, mask) \ 132 #define verified_sizeof(magic, mask) \
131 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES 133 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
132 134
133 #define MAGIC_MASK(mime_type, magic, mask) \ 135 #define MAGIC_MASK(mime_type, magic, mask) \
134 { (mime_type), (magic), verified_sizeof(magic, mask)-1, false, (mask) }, 136 { (mime_type), (magic), verified_sizeof(magic, mask) - 1, false, (mask) } \
137 ,
135 138
136 // Magic strings are case insensitive and must not include '\0' characters 139 // Magic strings are case insensitive and must not include '\0' characters
137 #define MAGIC_STRING(mime_type, magic) \ 140 #define MAGIC_STRING(mime_type, magic) \
138 { (mime_type), (magic), sizeof(magic)-1, true, NULL }, 141 { (mime_type), (magic), sizeof(magic) - 1, true, NULL } \
142 ,
139 143
140 static const MagicNumber kMagicNumbers[] = { 144 static const MagicNumber kMagicNumbers[] = {
141 // Source: HTML 5 specification 145 // Source: HTML 5 specification
142 MAGIC_NUMBER("application/pdf", "%PDF-") 146 MAGIC_NUMBER("application/pdf", "%PDF-")
143 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") 147 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
144 MAGIC_NUMBER("image/gif", "GIF87a") 148 MAGIC_NUMBER("image/gif", "GIF87a") MAGIC_NUMBER("image/gif", "GIF89a")
145 MAGIC_NUMBER("image/gif", "GIF89a") 149 MAGIC_NUMBER("image/png",
146 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") 150 "\x89"
147 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") 151 "PNG\x0D\x0A\x1A\x0A")
148 MAGIC_NUMBER("image/bmp", "BM") 152 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") MAGIC_NUMBER("image/bmp", "BM")
149 // Source: Mozilla 153 // Source: Mozilla
150 MAGIC_NUMBER("text/plain", "#!") // Script 154 MAGIC_NUMBER("text/plain", "#!") // Script
151 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS 155 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS
152 MAGIC_NUMBER("text/plain", "From") 156 MAGIC_NUMBER("text/plain", "From") MAGIC_NUMBER("text/plain", ">From")
153 MAGIC_NUMBER("text/plain", ">From") 157 // Chrome specific
154 // Chrome specific 158 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
155 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") 159 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") MAGIC_NUMBER(
156 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") 160 "video/x-ms-asf",
157 MAGIC_NUMBER("video/x-ms-asf", 161 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
158 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") 162 MAGIC_NUMBER("image/tiff", "I I") MAGIC_NUMBER("image/tiff", "II*")
159 MAGIC_NUMBER("image/tiff", "I I") 163 MAGIC_NUMBER("image/tiff", "MM\x00*") MAGIC_NUMBER("audio/mpeg", "ID3")
160 MAGIC_NUMBER("image/tiff", "II*") 164 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
161 MAGIC_NUMBER("image/tiff", "MM\x00*") 165 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
162 MAGIC_NUMBER("audio/mpeg", "ID3") 166 // TODO(abarth): we don't handle partial byte matches yet
163 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") 167 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB")
164 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") 168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE")
165 // TODO(abarth): we don't handle partial byte matches yet 169 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF")
166 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") 170 MAGIC_NUMBER("application/zip", "PK\x03\x04")
167 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") 171 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") 172 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
169 MAGIC_NUMBER("application/zip", "PK\x03\x04") 173 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE
170 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") 174 // Sniffing for Flash:
171 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") 175 //
172 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE 176 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
173 // Sniffing for Flash: 177 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
174 // 178 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
175 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") 179 //
176 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") 180 // Including these magic number for Flash is a trade off.
177 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") 181 //
178 // 182 // Pros:
179 // Including these magic number for Flash is a trade off. 183 // * Flash is an important and popular file format
180 // 184 //
181 // Pros: 185 // Cons:
182 // * Flash is an important and popular file format 186 // * These patterns are fairly weak
183 // 187 // * If we mistakenly decide something is Flash, we will execute it
184 // Cons: 188 // in the origin of an unsuspecting site. This could be a security
185 // * These patterns are fairly weak 189 // vulnerability if the site allows users to upload content.
186 // * If we mistakenly decide something is Flash, we will execute it 190 //
187 // in the origin of an unsuspecting site. This could be a security 191 // On balance, we do not include these patterns.
188 // vulnerability if the site allows users to upload content.
189 //
190 // On balance, we do not include these patterns.
191 }; 192 };
192 193
193 // The number of content bytes we need to use all our Microsoft Office magic 194 // The number of content bytes we need to use all our Microsoft Office magic
194 // numbers. 195 // numbers.
195 static const size_t kBytesRequiredForOfficeMagic = 8; 196 static const size_t kBytesRequiredForOfficeMagic = 8;
196 197
197 static const MagicNumber kOfficeMagicNumbers[] = { 198 static const MagicNumber kOfficeMagicNumbers[] = {
198 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") 199 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")
199 MAGIC_NUMBER("OOXML", "PK\x03\x04") 200 MAGIC_NUMBER("OOXML", "PK\x03\x04")};
mmenke 2014/10/10 18:12:39 Not putting the close brace on its own line seems
200 };
201 201
202 enum OfficeDocType { 202 enum OfficeDocType {
203 DOC_TYPE_WORD, 203 DOC_TYPE_WORD,
204 DOC_TYPE_EXCEL, 204 DOC_TYPE_EXCEL,
205 DOC_TYPE_POWERPOINT, 205 DOC_TYPE_POWERPOINT,
206 DOC_TYPE_NONE 206 DOC_TYPE_NONE
207 }; 207 };
208 208
209 struct OfficeExtensionType { 209 struct OfficeExtensionType {
210 OfficeDocType doc_type; 210 OfficeDocType doc_type;
211 const char* extension; 211 const char* extension;
212 size_t extension_len; 212 size_t extension_len;
213 }; 213 };
214 214
215 #define OFFICE_EXTENSION(type, extension) \ 215 #define OFFICE_EXTENSION(type, extension) \
216 { (type), (extension), sizeof(extension) - 1 }, 216 { (type), (extension), sizeof(extension) - 1 } \
217 ,
217 218
218 static const OfficeExtensionType kOfficeExtensionTypes[] = { 219 static const OfficeExtensionType kOfficeExtensionTypes[] = {
219 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") 220 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc")
220 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") 221 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls")
221 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") 222 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt")
222 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") 223 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx")
223 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") 224 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx")
224 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx") 225 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx")};
225 };
226 226
227 static const MagicNumber kExtraMagicNumbers[] = { 227 static const MagicNumber kExtraMagicNumbers[] = {
228 MAGIC_NUMBER("image/x-xbitmap", "#define") 228 MAGIC_NUMBER("image/x-xbitmap", "#define")
229 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00") 229 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00")
230 MAGIC_NUMBER("image/svg+xml", "<?xml_version=") 230 MAGIC_NUMBER("image/svg+xml", "<?xml_version=")
231 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ") 231 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ")
232 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST") 232 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST")
233 MAGIC_NUMBER("audio/ogg", "OggS") 233 MAGIC_NUMBER("audio/ogg", "OggS")
234 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0") 234 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0")
235 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0") 235 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0")
236 MAGIC_NUMBER("video/3gpp", "....ftyp3g") 236 MAGIC_NUMBER("video/3gpp", "....ftyp3g")
237 MAGIC_NUMBER("video/3gpp", "....ftypavcl") 237 MAGIC_NUMBER("video/3gpp", "....ftypavcl")
238 MAGIC_NUMBER("video/mp4", "....ftyp") 238 MAGIC_NUMBER("video/mp4", "....ftyp")
239 MAGIC_NUMBER("video/quicktime", "....moov") 239 MAGIC_NUMBER("video/quicktime", "....moov")
240 MAGIC_NUMBER("application/x-shockwave-flash", "CWS") 240 MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
241 MAGIC_NUMBER("application/x-shockwave-flash", "FWS") 241 MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
242 MAGIC_NUMBER("video/x-flv", "FLV") 242 MAGIC_NUMBER("video/x-flv", "FLV") MAGIC_NUMBER("audio/x-flac", "fLaC")
243 MAGIC_NUMBER("audio/x-flac", "fLaC")
244 243
245 // RAW image types. 244 // RAW image types.
246 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR") 245 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR")
247 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR") 246 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR")
248 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM") 247 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM")
249 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian 248 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian
250 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian 249 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian
251 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian 250 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian
252 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ") 251 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ")
253 MAGIC_NUMBER("image/x-panasonic-raw", 252 MAGIC_NUMBER("image/x-panasonic-raw",
254 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw 253 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw
255 MAGIC_NUMBER("image/x-panasonic-raw", 254 MAGIC_NUMBER("image/x-panasonic-raw",
256 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2 255 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2
257 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw") 256 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw")
258 MAGIC_NUMBER("image/x-x3f", "FOVb") 257 MAGIC_NUMBER("image/x-x3f", "FOVb")};
259 };
260 258
261 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will 259 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will
262 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is 260 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
263 // HTML, but we will not. 261 // HTML, but we will not.
264 262
265 #define MAGIC_HTML_TAG(tag) \ 263 #define MAGIC_HTML_TAG(tag) MAGIC_STRING("text/html", "<" tag)
266 MAGIC_STRING("text/html", "<" tag)
267 264
268 static const MagicNumber kSniffableTags[] = { 265 static const MagicNumber kSniffableTags[] = {
269 // XML processing directive. Although this is not an HTML mime type, we sniff 266 // XML processing directive. Although this is not an HTML mime type, we
270 // for this in the HTML phase because text/xml is just as powerful as HTML and 267 // sniff
271 // we want to leverage our white space skipping technology. 268 // for this in the HTML phase because text/xml is just as powerful as HTML
272 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla 269 // and
273 // DOCTYPEs 270 // we want to leverage our white space skipping technology.
274 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec 271 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla
275 // Sniffable tags, ordered by how often they occur in sniffable documents. 272 // DOCTYPEs
276 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla 273 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec
277 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla 274 // Sniffable tags, ordered by how often they occur in sniffable documents.
278 MAGIC_HTML_TAG("!--") 275 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla
279 MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla 276 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla
280 MAGIC_HTML_TAG("iframe") // Mozilla 277 MAGIC_HTML_TAG("!--") MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla
281 MAGIC_HTML_TAG("h1") // Mozilla 278 MAGIC_HTML_TAG("iframe") // Mozilla
282 MAGIC_HTML_TAG("div") // Mozilla 279 MAGIC_HTML_TAG("h1") // Mozilla
283 MAGIC_HTML_TAG("font") // Mozilla 280 MAGIC_HTML_TAG("div") // Mozilla
284 MAGIC_HTML_TAG("table") // Mozilla 281 MAGIC_HTML_TAG("font") // Mozilla
285 MAGIC_HTML_TAG("a") // Mozilla 282 MAGIC_HTML_TAG("table") // Mozilla
286 MAGIC_HTML_TAG("style") // Mozilla 283 MAGIC_HTML_TAG("a") // Mozilla
287 MAGIC_HTML_TAG("title") // Mozilla 284 MAGIC_HTML_TAG("style") // Mozilla
288 MAGIC_HTML_TAG("b") // Mozilla 285 MAGIC_HTML_TAG("title") // Mozilla
289 MAGIC_HTML_TAG("body") // Mozilla 286 MAGIC_HTML_TAG("b") // Mozilla
290 MAGIC_HTML_TAG("br") 287 MAGIC_HTML_TAG("body") // Mozilla
291 MAGIC_HTML_TAG("p") // Mozilla 288 MAGIC_HTML_TAG("br") MAGIC_HTML_TAG("p") // Mozilla
292 }; 289 };
293 290
294 static base::HistogramBase* UMASnifferHistogramGet(const char* name, 291 static base::HistogramBase* UMASnifferHistogramGet(const char* name,
295 int array_size) { 292 int array_size) {
296 base::HistogramBase* counter = 293 base::HistogramBase* counter = base::LinearHistogram::FactoryGet(
297 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, 294 name,
298 base::HistogramBase::kUmaTargetedHistogramFlag); 295 1,
296 array_size - 1,
297 array_size,
298 base::HistogramBase::kUmaTargetedHistogramFlag);
299 return counter; 299 return counter;
300 } 300 }
301 301
302 // Compare content header to a magic number where magic_entry can contain '.' 302 // Compare content header to a magic number where magic_entry can contain '.'
303 // for single character of anything, allowing some bytes to be skipped. 303 // for single character of anything, allowing some bytes to be skipped.
304 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { 304 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
305 while (len) { 305 while (len) {
306 if ((*magic_entry != '.') && (*magic_entry != *content)) 306 if ((*magic_entry != '.') && (*magic_entry != *content))
307 return false; 307 return false;
308 ++magic_entry; 308 ++magic_entry;
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
361 } 361 }
362 } 362 }
363 363
364 if (match) { 364 if (match) {
365 result->assign(magic_entry.mime_type); 365 result->assign(magic_entry.mime_type);
366 return true; 366 return true;
367 } 367 }
368 return false; 368 return false;
369 } 369 }
370 370
371 static bool CheckForMagicNumbers(const char* content, size_t size, 371 static bool CheckForMagicNumbers(const char* content,
372 const MagicNumber* magic, size_t magic_len, 372 size_t size,
373 const MagicNumber* magic,
374 size_t magic_len,
373 base::HistogramBase* counter, 375 base::HistogramBase* counter,
374 std::string* result) { 376 std::string* result) {
375 for (size_t i = 0; i < magic_len; ++i) { 377 for (size_t i = 0; i < magic_len; ++i) {
376 if (MatchMagicNumber(content, size, magic[i], result)) { 378 if (MatchMagicNumber(content, size, magic[i], result)) {
377 if (counter) counter->Add(static_cast<int>(i)); 379 if (counter)
380 counter->Add(static_cast<int>(i));
378 return true; 381 return true;
379 } 382 }
380 } 383 }
381 return false; 384 return false;
382 } 385 }
383 386
384 // Truncates |size| to |max_size| and returns true if |size| is at least 387 // Truncates |size| to |max_size| and returns true if |size| is at least
385 // |max_size|. 388 // |max_size|.
386 static bool TruncateSize(const size_t max_size, size_t* size) { 389 static bool TruncateSize(const size_t max_size, size_t* size) {
387 // Keep kMaxBytesToSniff honest. 390 // Keep kMaxBytesToSniff honest.
(...skipping 23 matching lines...) Expand all
411 for (pos = content; pos < end; ++pos) { 414 for (pos = content; pos < end; ++pos) {
412 if (!IsAsciiWhitespace(*pos)) 415 if (!IsAsciiWhitespace(*pos))
413 break; 416 break;
414 } 417 }
415 static base::HistogramBase* counter(NULL); 418 static base::HistogramBase* counter(NULL);
416 if (!counter) { 419 if (!counter) {
417 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", 420 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
418 arraysize(kSniffableTags)); 421 arraysize(kSniffableTags));
419 } 422 }
420 // |pos| now points to first non-whitespace character (or at end). 423 // |pos| now points to first non-whitespace character (or at end).
421 return CheckForMagicNumbers(pos, end - pos, 424 return CheckForMagicNumbers(pos,
422 kSniffableTags, arraysize(kSniffableTags), 425 end - pos,
423 counter, result); 426 kSniffableTags,
427 arraysize(kSniffableTags),
428 counter,
429 result);
424 } 430 }
425 431
426 // Returns true and sets result if the content matches any of kMagicNumbers. 432 // Returns true and sets result if the content matches any of kMagicNumbers.
427 // Clears have_enough_content if more data could possibly change the result. 433 // Clears have_enough_content if more data could possibly change the result.
428 static bool SniffForMagicNumbers(const char* content, 434 static bool SniffForMagicNumbers(const char* content,
429 size_t size, 435 size_t size,
430 bool* have_enough_content, 436 bool* have_enough_content,
431 std::string* result) { 437 std::string* result) {
432 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 438 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
433 439
434 // Check our big table of Magic Numbers 440 // Check our big table of Magic Numbers
435 static base::HistogramBase* counter(NULL); 441 static base::HistogramBase* counter(NULL);
436 if (!counter) { 442 if (!counter) {
437 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", 443 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
438 arraysize(kMagicNumbers)); 444 arraysize(kMagicNumbers));
439 } 445 }
440 return CheckForMagicNumbers(content, size, 446 return CheckForMagicNumbers(
441 kMagicNumbers, arraysize(kMagicNumbers), 447 content, size, kMagicNumbers, arraysize(kMagicNumbers), counter, result);
442 counter, result);
443 } 448 }
444 449
445 // Returns true and sets result if the content matches any of 450 // Returns true and sets result if the content matches any of
446 // kOfficeMagicNumbers, and the URL has the proper extension. 451 // kOfficeMagicNumbers, and the URL has the proper extension.
447 // Clears |have_enough_content| if more data could possibly change the result. 452 // Clears |have_enough_content| if more data could possibly change the result.
448 static bool SniffForOfficeDocs(const char* content, 453 static bool SniffForOfficeDocs(const char* content,
449 size_t size, 454 size_t size,
450 const GURL& url, 455 const GURL& url,
451 bool* have_enough_content, 456 bool* have_enough_content,
452 std::string* result) { 457 std::string* result) {
453 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); 458 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size);
454 459
455 // Check our table of magic numbers for Office file types. 460 // Check our table of magic numbers for Office file types.
456 std::string office_version; 461 std::string office_version;
457 if (!CheckForMagicNumbers(content, size, 462 if (!CheckForMagicNumbers(content,
458 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), 463 size,
459 NULL, &office_version)) 464 kOfficeMagicNumbers,
465 arraysize(kOfficeMagicNumbers),
466 NULL,
467 &office_version))
460 return false; 468 return false;
461 469
462 OfficeDocType type = DOC_TYPE_NONE; 470 OfficeDocType type = DOC_TYPE_NONE;
463 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { 471 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) {
464 std::string url_path = url.path(); 472 std::string url_path = url.path();
465 473
466 if (url_path.length() < kOfficeExtensionTypes[i].extension_len) 474 if (url_path.length() < kOfficeExtensionTypes[i].extension_len)
467 continue; 475 continue;
468 476
469 const char* extension = 477 const char* extension =
470 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len]; 478 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len];
471 479
472 if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension, 480 if (0 == base::strncasecmp(extension,
481 kOfficeExtensionTypes[i].extension,
473 kOfficeExtensionTypes[i].extension_len)) { 482 kOfficeExtensionTypes[i].extension_len)) {
474 type = kOfficeExtensionTypes[i].doc_type; 483 type = kOfficeExtensionTypes[i].doc_type;
475 break; 484 break;
476 } 485 }
477 } 486 }
478 487
479 if (type == DOC_TYPE_NONE) 488 if (type == DOC_TYPE_NONE)
480 return false; 489 return false;
481 490
482 if (office_version == "CFB") { 491 if (office_version == "CFB") {
483 switch (type) { 492 switch (type) {
484 case DOC_TYPE_WORD: 493 case DOC_TYPE_WORD:
485 *result = "application/msword"; 494 *result = "application/msword";
486 return true; 495 return true;
487 case DOC_TYPE_EXCEL: 496 case DOC_TYPE_EXCEL:
488 *result = "application/vnd.ms-excel"; 497 *result = "application/vnd.ms-excel";
489 return true; 498 return true;
490 case DOC_TYPE_POWERPOINT: 499 case DOC_TYPE_POWERPOINT:
491 *result = "application/vnd.ms-powerpoint"; 500 *result = "application/vnd.ms-powerpoint";
492 return true; 501 return true;
493 case DOC_TYPE_NONE: 502 case DOC_TYPE_NONE:
494 NOTREACHED(); 503 NOTREACHED();
495 return false; 504 return false;
496 } 505 }
497 } else if (office_version == "OOXML") { 506 } else if (office_version == "OOXML") {
498 switch (type) { 507 switch (type) {
499 case DOC_TYPE_WORD: 508 case DOC_TYPE_WORD:
500 *result = "application/vnd.openxmlformats-officedocument." 509 *result =
501 "wordprocessingml.document"; 510 "application/vnd.openxmlformats-officedocument."
511 "wordprocessingml.document";
502 return true; 512 return true;
503 case DOC_TYPE_EXCEL: 513 case DOC_TYPE_EXCEL:
504 *result = "application/vnd.openxmlformats-officedocument." 514 *result =
505 "spreadsheetml.sheet"; 515 "application/vnd.openxmlformats-officedocument."
516 "spreadsheetml.sheet";
506 return true; 517 return true;
507 case DOC_TYPE_POWERPOINT: 518 case DOC_TYPE_POWERPOINT:
508 *result = "application/vnd.openxmlformats-officedocument." 519 *result =
509 "presentationml.presentation"; 520 "application/vnd.openxmlformats-officedocument."
521 "presentationml.presentation";
510 return true; 522 return true;
511 case DOC_TYPE_NONE: 523 case DOC_TYPE_NONE:
512 NOTREACHED(); 524 NOTREACHED();
513 return false; 525 return false;
514 } 526 }
515 } 527 }
516 528
517 NOTREACHED(); 529 NOTREACHED();
518 return false; 530 return false;
519 } 531 }
520 532
521 static bool IsOfficeType(const std::string& type_hint) { 533 static bool IsOfficeType(const std::string& type_hint) {
522 return (type_hint == "application/msword" || 534 return (type_hint == "application/msword" ||
523 type_hint == "application/vnd.ms-excel" || 535 type_hint == "application/vnd.ms-excel" ||
524 type_hint == "application/vnd.ms-powerpoint" || 536 type_hint == "application/vnd.ms-powerpoint" ||
525 type_hint == "application/vnd.openxmlformats-officedocument." 537 type_hint ==
526 "wordprocessingml.document" || 538 "application/vnd.openxmlformats-officedocument."
527 type_hint == "application/vnd.openxmlformats-officedocument." 539 "wordprocessingml.document" ||
528 "spreadsheetml.sheet" || 540 type_hint ==
529 type_hint == "application/vnd.openxmlformats-officedocument." 541 "application/vnd.openxmlformats-officedocument."
530 "presentationml.presentation" || 542 "spreadsheetml.sheet" ||
543 type_hint ==
544 "application/vnd.openxmlformats-officedocument."
545 "presentationml.presentation" ||
531 type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" || 546 type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" ||
532 type_hint == "application/vnd.ms-word.document.macroenabled.12" || 547 type_hint == "application/vnd.ms-word.document.macroenabled.12" ||
533 type_hint == "application/vnd.ms-powerpoint.presentation." 548 type_hint ==
534 "macroenabled.12" || 549 "application/vnd.ms-powerpoint.presentation."
550 "macroenabled.12" ||
535 type_hint == "application/mspowerpoint" || 551 type_hint == "application/mspowerpoint" ||
536 type_hint == "application/msexcel" || 552 type_hint == "application/msexcel" ||
537 type_hint == "application/vnd.ms-word" || 553 type_hint == "application/vnd.ms-word" ||
538 type_hint == "application/vnd.ms-word.document.12" || 554 type_hint == "application/vnd.ms-word.document.12" ||
539 type_hint == "application/vnd.msword"); 555 type_hint == "application/vnd.msword");
540 } 556 }
541 557
542 // This function checks for files that have a Microsoft Office MIME type 558 // This function checks for files that have a Microsoft Office MIME type
543 // set, but are not actually Office files. 559 // set, but are not actually Office files.
544 // 560 //
545 // If this is not actually an Office file, |*result| is set to 561 // If this is not actually an Office file, |*result| is set to
546 // "application/octet-stream", otherwise it is not modified. 562 // "application/octet-stream", otherwise it is not modified.
547 // 563 //
548 // Returns false if additional data is required to determine the file type, or 564 // Returns false if additional data is required to determine the file type, or
549 // true if there is enough data to make a decision. 565 // true if there is enough data to make a decision.
550 static bool SniffForInvalidOfficeDocs(const char* content, 566 static bool SniffForInvalidOfficeDocs(const char* content,
551 size_t size, 567 size_t size,
552 const GURL& url, 568 const GURL& url,
553 std::string* result) { 569 std::string* result) {
554 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size)) 570 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size))
555 return false; 571 return false;
556 572
557 // Check our table of magic numbers for Office file types. If it does not 573 // Check our table of magic numbers for Office file types. If it does not
558 // match one, the MIME type was invalid. Set it instead to a safe value. 574 // match one, the MIME type was invalid. Set it instead to a safe value.
559 std::string office_version; 575 std::string office_version;
560 if (!CheckForMagicNumbers(content, size, 576 if (!CheckForMagicNumbers(content,
561 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), 577 size,
562 NULL, &office_version)) { 578 kOfficeMagicNumbers,
579 arraysize(kOfficeMagicNumbers),
580 NULL,
581 &office_version)) {
563 *result = "application/octet-stream"; 582 *result = "application/octet-stream";
564 } 583 }
565 584
566 // We have enough information to determine if this was a Microsoft Office 585 // We have enough information to determine if this was a Microsoft Office
567 // document or not, so sniffing is completed. 586 // document or not, so sniffing is completed.
568 return true; 587 return true;
569 } 588 }
570 589
571 // Byte order marks 590 // Byte order marks
572 static const MagicNumber kMagicXML[] = { 591 static const MagicNumber kMagicXML[] = {
573 // We want to be very conservative in interpreting text/xml content as 592 // We want to be very conservative in interpreting text/xml content as
574 // XHTML -- we just want to sniff enough to make unit tests pass. 593 // XHTML -- we just want to sniff enough to make unit tests pass.
575 // So we match explicitly on this, and don't match other ways of writing 594 // So we match explicitly on this, and don't match other ways of writing
576 // it in semantically-equivalent ways. 595 // it in semantically-equivalent ways.
577 MAGIC_STRING("application/xhtml+xml", 596 MAGIC_STRING("application/xhtml+xml",
578 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") 597 "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
579 MAGIC_STRING("application/atom+xml", "<feed") 598 MAGIC_STRING("application/atom+xml", "<feed")
580 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 599 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8
581 }; 600 };
582 601
583 // Returns true and sets result if the content appears to contain XHTML or a 602 // Returns true and sets result if the content appears to contain XHTML or a
584 // feed. 603 // feed.
585 // Clears have_enough_content if more data could possibly change the result. 604 // Clears have_enough_content if more data could possibly change the result.
586 // 605 //
587 // TODO(evanm): this is similar but more conservative than what Safari does, 606 // TODO(evanm): this is similar but more conservative than what Safari does,
588 // while HTML5 has a different recommendation -- what should we do? 607 // while HTML5 has a different recommendation -- what should we do?
589 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset 608 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
590 // of ASCII -- do we care? 609 // of ASCII -- do we care?
591 static bool SniffXML(const char* content, 610 static bool SniffXML(const char* content,
592 size_t size, 611 size_t size,
593 bool* have_enough_content, 612 bool* have_enough_content,
594 std::string* result) { 613 std::string* result) {
595 // We allow at most 300 bytes of content before we expect the opening tag. 614 // We allow at most 300 bytes of content before we expect the opening tag.
596 *have_enough_content &= TruncateSize(300, &size); 615 *have_enough_content &= TruncateSize(300, &size);
597 const char* pos = content; 616 const char* pos = content;
598 const char* const end = content + size; 617 const char* const end = content + size;
599 618
600 // This loop iterates through tag-looking offsets in the file. 619 // This loop iterates through tag-looking offsets in the file.
601 // We want to skip XML processing instructions (of the form "<?xml ...") 620 // We want to skip XML processing instructions (of the form "<?xml ...")
602 // and stop at the first "plain" tag, then make a decision on the mime-type 621 // and stop at the first "plain" tag, then make a decision on the mime-type
603 // based on the name (or possibly attributes) of that tag. 622 // based on the name (or possibly attributes) of that tag.
604 static base::HistogramBase* counter(NULL); 623 static base::HistogramBase* counter(NULL);
605 if (!counter) { 624 if (!counter) {
606 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", 625 counter =
607 arraysize(kMagicXML)); 626 UMASnifferHistogramGet("mime_sniffer.kMagicXML2", arraysize(kMagicXML));
608 } 627 }
609 const int kMaxTagIterations = 5; 628 const int kMaxTagIterations = 5;
610 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { 629 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
611 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); 630 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
612 if (!pos) 631 if (!pos)
613 return false; 632 return false;
614 633
615 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0) { 634 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0) {
616 // Skip XML declarations. 635 // Skip XML declarations.
617 ++pos; 636 ++pos;
618 continue; 637 continue;
619 } else if (base::strncasecmp(pos, "<!DOCTYPE", 638 } else if (base::strncasecmp(pos, "<!DOCTYPE", sizeof("<!DOCTYPE") - 1) ==
620 sizeof("<!DOCTYPE") - 1) == 0) { 639 0) {
mmenke 2014/10/10 18:12:39 Think this is pretty ugly - I find no extra indent
621 // Skip DOCTYPE declarations. 640 // Skip DOCTYPE declarations.
622 ++pos; 641 ++pos;
623 continue; 642 continue;
624 } 643 }
625 644
626 if (CheckForMagicNumbers(pos, end - pos, 645 if (CheckForMagicNumbers(
627 kMagicXML, arraysize(kMagicXML), 646 pos, end - pos, kMagicXML, arraysize(kMagicXML), counter, result))
628 counter, result))
629 return true; 647 return true;
mmenke 2014/10/10 18:12:39 This is a style violation - when an if body takes
630 648
631 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult 649 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
632 // to identify. 650 // to identify.
633 651
634 // If we get here, we've hit an initial tag that hasn't matched one of the 652 // If we get here, we've hit an initial tag that hasn't matched one of the
635 // above tests. Abort. 653 // above tests. Abort.
636 return true; 654 return true;
637 } 655 }
638 656
639 // We iterated too far without finding a start tag. 657 // We iterated too far without finding a start tag.
640 // If we have more content to look at, we aren't going to change our mind by 658 // If we have more content to look at, we aren't going to change our mind by
641 // seeing more bytes from the network. 659 // seeing more bytes from the network.
642 return pos < end; 660 return pos < end;
643 } 661 }
644 662
645 // Byte order marks 663 // Byte order marks
646 static const MagicNumber kByteOrderMark[] = { 664 static const MagicNumber kByteOrderMark[] = {
647 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE 665 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE
648 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE 666 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE
649 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 667 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8
650 }; 668 };
651 669
652 // Whether a given byte looks like it might be part of binary content. 670 // Whether a given byte looks like it might be part of binary content.
653 // Source: HTML5 spec 671 // Source: HTML5 spec
654 static char kByteLooksBinary[] = { 672 static char kByteLooksBinary[] = {
655 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F 673 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F
656 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F 674 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F
657 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F 675 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F
658 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F 676 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F
659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F 677 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F
660 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F 678 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F
661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F 679 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F
662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F 680 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F
663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F 681 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F
664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F 682 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F
665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF 683 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF
666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF 684 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF
667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF 685 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF 686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF
669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF 687 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF
670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF 688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF
671 }; 689 };
672 690
673 // Returns true and sets result to "application/octet-stream" if the content 691 // Returns true and sets result to "application/octet-stream" if the content
674 // appears to be binary data. Otherwise, returns false and sets "text/plain". 692 // appears to be binary data. Otherwise, returns false and sets "text/plain".
675 // Clears have_enough_content if more data could possibly change the result. 693 // Clears have_enough_content if more data could possibly change the result.
676 static bool SniffBinary(const char* content, 694 static bool SniffBinary(const char* content,
677 size_t size, 695 size_t size,
678 bool* have_enough_content, 696 bool* have_enough_content,
679 std::string* result) { 697 std::string* result) {
680 // There is no consensus about exactly how to sniff for binary content. 698 // There is no consensus about exactly how to sniff for binary content.
681 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. 699 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
682 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. 700 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
683 // Here, we side with FF, but with a smaller buffer. This size was chosen 701 // Here, we side with FF, but with a smaller buffer. This size was chosen
684 // because it is small enough to comfortably fit into a single packet (after 702 // because it is small enough to comfortably fit into a single packet (after
685 // allowing for headers) and yet large enough to account for binary formats 703 // allowing for headers) and yet large enough to account for binary formats
686 // that have a significant amount of ASCII at the beginning (crbug.com/15314). 704 // that have a significant amount of ASCII at the beginning (crbug.com/15314).
687 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); 705 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
688 706
689 // First, we look for a BOM. 707 // First, we look for a BOM.
690 static base::HistogramBase* counter(NULL); 708 static base::HistogramBase* counter(NULL);
691 if (!counter) { 709 if (!counter) {
692 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", 710 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
693 arraysize(kByteOrderMark)); 711 arraysize(kByteOrderMark));
694 } 712 }
695 std::string unused; 713 std::string unused;
696 if (CheckForMagicNumbers(content, size, 714 if (CheckForMagicNumbers(content,
697 kByteOrderMark, arraysize(kByteOrderMark), 715 size,
698 counter, &unused)) { 716 kByteOrderMark,
717 arraysize(kByteOrderMark),
718 counter,
719 &unused)) {
699 // If there is BOM, we think the buffer is not binary. 720 // If there is BOM, we think the buffer is not binary.
700 result->assign("text/plain"); 721 result->assign("text/plain");
701 return false; 722 return false;
702 } 723 }
703 724
704 // Next we look to see if any of the bytes "look binary." 725 // Next we look to see if any of the bytes "look binary."
705 for (size_t i = 0; i < size; ++i) { 726 for (size_t i = 0; i < size; ++i) {
706 // If we see a binary-looking byte, we think the content is binary. 727 // If we see a binary-looking byte, we think the content is binary.
707 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { 728 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
708 result->assign("application/octet-stream"); 729 result->assign("application/octet-stream");
709 return true; 730 return true;
710 } 731 }
711 } 732 }
712 733
713 // No evidence either way. Default to non-binary and, if truncated, clear 734 // No evidence either way. Default to non-binary and, if truncated, clear
714 // have_enough_content because there could be a binary looking byte in the 735 // have_enough_content because there could be a binary looking byte in the
715 // truncated data. 736 // truncated data.
716 *have_enough_content &= is_truncated; 737 *have_enough_content &= is_truncated;
717 result->assign("text/plain"); 738 result->assign("text/plain");
718 return false; 739 return false;
719 } 740 }
720 741
721 static bool IsUnknownMimeType(const std::string& mime_type) { 742 static bool IsUnknownMimeType(const std::string& mime_type) {
722 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. 743 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
723 // If we do, please be careful not to alter the semantics at all. 744 // If we do, please be careful not to alter the semantics at all.
724 static const char* kUnknownMimeTypes[] = { 745 static const char* kUnknownMimeTypes[] = {
725 // Empty mime types are as unknown as they get. 746 // Empty mime types are as unknown as they get.
726 "", 747 "",
727 // The unknown/unknown type is popular and uninformative 748 // The unknown/unknown type is popular and uninformative
728 "unknown/unknown", 749 "unknown/unknown",
729 // The second most popular unknown mime type is application/unknown 750 // The second most popular unknown mime type is application/unknown
730 "application/unknown", 751 "application/unknown",
731 // Firefox rejects a mime type if it is exactly */* 752 // Firefox rejects a mime type if it is exactly */*
732 "*/*", 753 "*/*",
733 }; 754 };
734 static base::HistogramBase* counter(NULL); 755 static base::HistogramBase* counter(NULL);
735 if (!counter) { 756 if (!counter) {
736 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", 757 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
737 arraysize(kUnknownMimeTypes) + 1); 758 arraysize(kUnknownMimeTypes) + 1);
738 } 759 }
739 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { 760 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
740 if (mime_type == kUnknownMimeTypes[i]) { 761 if (mime_type == kUnknownMimeTypes[i]) {
741 counter->Add(i); 762 counter->Add(i);
742 return true; 763 return true;
(...skipping 21 matching lines...) Expand all
764 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); 785 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
765 786
766 // Technically, the crx magic number is just Cr24, but the bytes after that 787 // Technically, the crx magic number is just Cr24, but the bytes after that
767 // are a version number which changes infrequently. Including it in the 788 // are a version number which changes infrequently. Including it in the
768 // sniffing gives us less room for error. If the version number ever changes, 789 // sniffing gives us less room for error. If the version number ever changes,
769 // we can just add an entry to this list. 790 // we can just add an entry to this list.
770 // 791 //
771 // TODO(aa): If we ever have another magic number, we'll want to pass a 792 // TODO(aa): If we ever have another magic number, we'll want to pass a
772 // histogram into CheckForMagicNumbers(), below, to see which one matched. 793 // histogram into CheckForMagicNumbers(), below, to see which one matched.
773 static const struct MagicNumber kCRXMagicNumbers[] = { 794 static const struct MagicNumber kCRXMagicNumbers[] = {
774 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") 795 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")};
775 };
776 796
777 // Only consider files that have the extension ".crx". 797 // Only consider files that have the extension ".crx".
778 static const char kCRXExtension[] = ".crx"; 798 static const char kCRXExtension[] = ".crx";
779 // Ignore null by subtracting 1. 799 // Ignore null by subtracting 1.
780 static const int kExtensionLength = arraysize(kCRXExtension) - 1; 800 static const int kExtensionLength = arraysize(kCRXExtension) - 1;
781 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == 801 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
782 url.path().size() - kExtensionLength) { 802 url.path().size() - kExtensionLength) {
783 counter->Add(1); 803 counter->Add(1);
784 } else { 804 } else {
785 return false; 805 return false;
786 } 806 }
787 807
788 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 808 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
789 if (CheckForMagicNumbers(content, size, 809 if (CheckForMagicNumbers(content,
790 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), 810 size,
791 NULL, result)) { 811 kCRXMagicNumbers,
812 arraysize(kCRXMagicNumbers),
813 NULL,
814 result)) {
792 counter->Add(2); 815 counter->Add(2);
793 } else { 816 } else {
794 return false; 817 return false;
795 } 818 }
796 819
797 return true; 820 return true;
798 } 821 }
799 822
800 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { 823 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
801 static base::HistogramBase* should_sniff_counter(NULL); 824 static base::HistogramBase* should_sniff_counter(NULL);
802 if (!should_sniff_counter) { 825 if (!should_sniff_counter) {
803 should_sniff_counter = 826 should_sniff_counter =
804 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); 827 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
805 } 828 }
806 bool sniffable_scheme = url.is_empty() || 829 bool sniffable_scheme = url.is_empty() || url.SchemeIsHTTPOrHTTPS() ||
807 url.SchemeIsHTTPOrHTTPS() ||
808 url.SchemeIs("ftp") || 830 url.SchemeIs("ftp") ||
809 #if defined(OS_ANDROID) 831 #if defined(OS_ANDROID)
810 url.SchemeIs("content") || 832 url.SchemeIs("content") ||
811 #endif 833 #endif
812 url.SchemeIsFile() || 834 url.SchemeIsFile() || url.SchemeIsFileSystem();
813 url.SchemeIsFileSystem();
814 if (!sniffable_scheme) { 835 if (!sniffable_scheme) {
815 should_sniff_counter->Add(1); 836 should_sniff_counter->Add(1);
816 return false; 837 return false;
817 } 838 }
818 839
819 static const char* kSniffableTypes[] = { 840 static const char*
820 // Many web servers are misconfigured to send text/plain for many 841 kSniffableTypes
821 // different types of content. 842 [] = {// Many web servers are misconfigured to send text/plain for
mmenke 2014/10/10 18:12:39 Just no.
822 "text/plain", 843 // many
823 // We want to sniff application/octet-stream for 844 // different types of content.
824 // application/x-chrome-extension, but nothing else. 845 "text/plain",
825 "application/octet-stream", 846 // We want to sniff application/octet-stream for
826 // XHTML and Atom/RSS feeds are often served as plain xml instead of 847 // application/x-chrome-extension, but nothing else.
827 // their more specific mime types. 848 "application/octet-stream",
828 "text/xml", 849 // XHTML and Atom/RSS feeds are often served as plain xml
829 "application/xml", 850 // instead of
830 // Check for false Microsoft Office MIME types. 851 // their more specific mime types.
831 "application/msword", 852 "text/xml",
832 "application/vnd.ms-excel", 853 "application/xml",
833 "application/vnd.ms-powerpoint", 854 // Check for false Microsoft Office MIME types.
834 "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 855 "application/msword",
835 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 856 "application/vnd.ms-excel",
836 "application/vnd.openxmlformats-officedocument.presentationml.presentation", 857 "application/vnd.ms-powerpoint",
837 "application/vnd.ms-excel.sheet.macroenabled.12", 858 "application/"
838 "application/vnd.ms-word.document.macroenabled.12", 859 "vnd.openxmlformats-officedocument.wordprocessingml.document",
839 "application/vnd.ms-powerpoint.presentation.macroenabled.12", 860 "application/"
840 "application/mspowerpoint", 861 "vnd.openxmlformats-officedocument.spreadsheetml.sheet",
841 "application/msexcel", 862 "application/"
842 "application/vnd.ms-word", 863 "vnd.openxmlformats-officedocument.presentationml.presentation",
843 "application/vnd.ms-word.document.12", 864 "application/vnd.ms-excel.sheet.macroenabled.12",
844 "application/vnd.msword", 865 "application/vnd.ms-word.document.macroenabled.12",
845 }; 866 "application/vnd.ms-powerpoint.presentation.macroenabled.12",
867 "application/mspowerpoint",
868 "application/msexcel",
869 "application/vnd.ms-word",
870 "application/vnd.ms-word.document.12",
871 "application/vnd.msword",
872 };
846 static base::HistogramBase* counter(NULL); 873 static base::HistogramBase* counter(NULL);
847 if (!counter) { 874 if (!counter) {
848 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", 875 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
849 arraysize(kSniffableTypes) + 1); 876 arraysize(kSniffableTypes) + 1);
850 } 877 }
851 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { 878 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
852 if (mime_type == kSniffableTypes[i]) { 879 if (mime_type == kSniffableTypes[i]) {
853 counter->Add(i); 880 counter->Add(i);
854 should_sniff_counter->Add(2); 881 should_sniff_counter->Add(2);
855 return true; 882 return true;
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
923 // We're not interested in sniffing these types for images and the like. 950 // We're not interested in sniffing these types for images and the like.
924 // Instead, we're looking explicitly for a feed. If we don't find one 951 // Instead, we're looking explicitly for a feed. If we don't find one
925 // we're done and return early. 952 // we're done and return early.
926 if (SniffXML(content, content_size, &have_enough_content, result)) 953 if (SniffXML(content, content_size, &have_enough_content, result))
927 return true; 954 return true;
928 return have_enough_content; 955 return have_enough_content;
929 } 956 }
930 957
931 // CRX files (Chrome extensions) have a special sniffing algorithm. It is 958 // CRX files (Chrome extensions) have a special sniffing algorithm. It is
932 // tighter than the others because we don't have to match legacy behavior. 959 // tighter than the others because we don't have to match legacy behavior.
933 if (SniffCRX(content, content_size, url, type_hint, 960 if (SniffCRX(
934 &have_enough_content, result)) 961 content, content_size, url, type_hint, &have_enough_content, result))
935 return true; 962 return true;
936 963
937 // Check the file extension and magic numbers to see if this is an Office 964 // Check the file extension and magic numbers to see if this is an Office
938 // document. This needs to be checked before the general magic numbers 965 // document. This needs to be checked before the general magic numbers
939 // because zip files and Office documents (OOXML) have the same magic number. 966 // because zip files and Office documents (OOXML) have the same magic number.
940 if (SniffForOfficeDocs(content, content_size, url, 967 if (SniffForOfficeDocs(
941 &have_enough_content, result)) 968 content, content_size, url, &have_enough_content, result))
942 return true; // We've matched a magic number. No more content needed. 969 return true; // We've matched a magic number. No more content needed.
943 970
944 // We're not interested in sniffing for magic numbers when the type_hint 971 // We're not interested in sniffing for magic numbers when the type_hint
945 // is application/octet-stream. Time to bail out. 972 // is application/octet-stream. Time to bail out.
946 if (type_hint == "application/octet-stream") 973 if (type_hint == "application/octet-stream")
947 return have_enough_content; 974 return have_enough_content;
948 975
949 // Now we look in our large table of magic numbers to see if we can find 976 // Now we look in our large table of magic numbers to see if we can find
950 // anything that matches the content. 977 // anything that matches the content.
951 if (SniffForMagicNumbers(content, content_size, 978 if (SniffForMagicNumbers(content, content_size, &have_enough_content, result))
952 &have_enough_content, result))
953 return true; // We've matched a magic number. No more content needed. 979 return true; // We've matched a magic number. No more content needed.
954 980
955 return have_enough_content; 981 return have_enough_content;
956 } 982 }
957 983
958 bool SniffMimeTypeFromLocalData(const char* content, 984 bool SniffMimeTypeFromLocalData(const char* content,
959 size_t size, 985 size_t size,
960 std::string* result) { 986 std::string* result) {
961 // First check the extra table. 987 // First check the extra table.
962 if (CheckForMagicNumbers(content, size, kExtraMagicNumbers, 988 if (CheckForMagicNumbers(content,
963 arraysize(kExtraMagicNumbers), NULL, result)) 989 size,
990 kExtraMagicNumbers,
991 arraysize(kExtraMagicNumbers),
992 NULL,
993 result))
964 return true; 994 return true;
965 // Finally check the original table. 995 // Finally check the original table.
966 return CheckForMagicNumbers(content, size, kMagicNumbers, 996 return CheckForMagicNumbers(
967 arraysize(kMagicNumbers), NULL, result); 997 content, size, kMagicNumbers, arraysize(kMagicNumbers), NULL, result);
968 } 998 }
969 999
970 } // namespace net 1000 } // namespace net
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698