OLD | NEW |
(Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style |
| 3 # license that can be found in the LICENSE file or at |
| 4 # https://developers.google.com/open-source/licenses/bsd |
| 5 |
| 6 """Utility routines for dealing with MIME types and decoding text files.""" |
| 7 |
| 8 import itertools |
| 9 import logging |
| 10 |
| 11 from framework import framework_constants |
| 12 |
| 13 |
# Content types keyed by filename extension (no leading dot).
_EXTENSION_TO_CTYPE_TABLE = {
    # Image formats and PDF: we trust the browser to display these itself.
    'gif': 'image/gif',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'png': 'image/png',
    'ico': 'image/x-icon',
    'svg': 'image/svg+xml',
    'pdf': 'application/pdf',

    # Deliberately absent: any MIME type that would make the browser launch
    # a local application.  Issue tracking does not require that, and it is
    # a potential security risk.
}
| 28 |
| 29 |
def GuessContentTypeFromFilename(filename):
  """Choose the MIME type to serve a file with, judging by its extension.

  The extension is whatever follows the last '.' in the name, compared
  case-insensitively; a name without any '.' has the empty extension.

  Args:
    filename: String name of a file.

  Returns:
    MIME type string to use when serving this file: 'text/plain' when the
    extension is listed in COMMON_TEXT_FILE_EXTENSIONS, the mapped type when
    it is listed in _EXTENSION_TO_CTYPE_TABLE, and 'application/octet-stream'
    otherwise.  Serving virtually all binary files as octet-stream limits the
    richness of the user's experience (e.g., an MS Office attachment cannot
    be opened directly by clicking on it), but it is safer.
  """
  _, dot, suffix = filename.rpartition('.')
  ext = suffix.lower() if dot else ''

  if ext in COMMON_TEXT_FILE_EXTENSIONS:
    return 'text/plain'

  # ext is already lowercase, so it can be used as the lookup key directly.
  return _EXTENSION_TO_CTYPE_TABLE.get(ext, 'application/octet-stream')
| 48 |
| 49 |
# Thresholds for the line-length heuristic that detects binary content.
# Every line must be no longer than the upper limit, and at least the given
# ratio of lines must be no longer than the lower limit.
_MAX_SOURCE_LINE_LEN_LOWER = 350
_MAX_SOURCE_LINE_LEN_UPPER = 800
_SOURCE_LINE_LEN_LOWER_RATIO = 0.9

# Placeholder shown in place of an undecodable commit log or author value.
UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'

# Size limits for repository files: beyond this many bytes we do not try to
# display the file, and beyond this many lines it is considered "long".
SOURCE_FILE_MAX_SIZE = 1000 * 1024
SOURCE_FILE_MAX_LINES = 50000
| 63 |
# The source code browser will not attempt to display any filename ending
# with one of these extensions (lowercase, without the leading dot).
#
# Each entry must be its own string literal followed by a comma: a comma
# typed inside the quotes silently merges two adjacent literals into one
# bogus entry (as previously happened with 'vsd' and 'mpg').
COMMON_BINARY_FILE_EXTENSIONS = {
    'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',
    'bmp', 'png', 'vsd', 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',
    'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',
    'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',
    'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',
    # NOTE(review): 'graffie' may have been meant as 'graffle' (OmniGraffle)
    # -- confirm before changing.
    'graffie', 'violet',
}
| 74 |
# Files with one of these extensions, or with one of these exact filenames,
# have their contents displayed as text data by the source code browser
# (assuming they decode correctly).  Both sets start from the keys of the
# corresponding prettify map and add a few extra entries.
COMMON_TEXT_FILE_EXTENSIONS = set(
    framework_constants.PRETTIFY_CLASS_MAP.iterkeys()).union({
        '', 'ada', 'asm', 'asp', 'bat', 'cgi', 'csv', 'el', 'emacs',
        'jsp', 'log', 'markdown', 'md', 'mf', 'plist', 'properties', 'r',
        'rc', 'txt', 'vim', 'wiki', 'xemacs', 'yacc',
    })
COMMON_TEXT_FILENAMES = set(
    framework_constants.PRETTIFY_FILENAME_CLASS_MAP.iterkeys()).union(
        {'authors', 'install', 'readme'})
| 87 |
| 88 |
def DecodeFileContents(file_contents, path=None):
  """Try converting file contents to unicode using utf-8 or latin-1.

  This is applicable to untrusted maybe-text from vcs files or inbound emails.

  We try decoding the file as utf-8, then fall back on latin-1.  In the former
  case, we call the file a text file; in the latter case, we guess whether
  the file is text or binary based on its name and on line length.

  If we guess text when the file is binary, the user sees safely encoded
  gibberish.  If the other way around, the user sees a message that we will
  not display the file.

  TODO(jrobbins): we could try the user-supplied encoding, iff it
  is one of the encodings that we know that we can handle.

  Args:
    file_contents: byte string from svn file.  It could be text in almost
        any encoding, or binary.  We cannot trust the user-supplied encoding
        in the mime-type property.
    path: string pathname of file.  Optional; when given, its extension and
        base filename are checked against the well-known binary and text
        names before any content heuristic is applied.

  Returns:
    The tuple (unicode_string, is_binary, is_long):
      - The unicode version of the string (empty when the file is rejected
        as binary on the strength of its extension alone).
      - is_binary is true if the path has a well-known binary extension, or
        if the contents are not valid utf-8 and fail the line-length
        heuristic.
      - is_long is true if the file has more than SOURCE_FILE_MAX_LINES
        lines.  Always false for files rejected by extension.
  """
  # If the filename is one that typically identifies a binary file, then
  # just treat it as binary without any further analysis.
  ext = None
  if path and '.' in path:
    # Lowercase once; every later comparison is case-insensitive.
    ext = path.split('.')[-1].lower()
    if ext in COMMON_BINARY_FILE_EXTENSIONS:
      # If the file is binary, we don't care about the length, since we
      # don't show or diff it.
      return u'', True, False

  # If the string can be decoded as utf-8, we treat it as textual.
  try:
    u_str = file_contents.decode('utf-8', 'strict')
  except UnicodeDecodeError:
    logging.info('not a utf-8 file: %s bytes', len(file_contents))
  else:
    # The number of lines is one more than the number of newlines; counting
    # avoids building a throwaway list of every line.
    is_long = u_str.count('\n') + 1 > SOURCE_FILE_MAX_LINES
    return u_str, False, is_long

  # Fall back on latin-1.  This will always succeed, since every byte maps to
  # something in latin-1, even if that something is gibberish.
  u_str = file_contents.decode('latin-1', 'strict')

  lines = u_str.split('\n')
  is_long = len(lines) > SOURCE_FILE_MAX_LINES

  # Treat decodable files with certain filenames and/or extensions as text
  # files.  This avoids problems with common file types using our text/binary
  # heuristic rules below.
  if path:
    name = path.split('/')[-1]
    if (name.lower() in COMMON_TEXT_FILENAMES or
        (ext and ext in COMMON_TEXT_FILE_EXTENSIONS)):
      return u_str, False, is_long

  # HEURISTIC: Binary files can qualify as latin-1, so we need to
  # check further.  Any real source code is going to be divided into
  # reasonably sized lines.  All lines must be below an upper character limit,
  # and most lines must be below a lower limit.  This allows some exceptions
  # to the lower limit, but is more restrictive than just using a single
  # large character limit.
  #
  # Only the first SOURCE_FILE_MAX_LINES lines are examined, so the ratio
  # must be taken over the number of lines examined, not over the total.
  # Dividing by the total would classify every sufficiently long file as
  # binary, no matter how short its lines are.
  examined = min(len(lines), SOURCE_FILE_MAX_LINES)
  short_count = 0
  for line in itertools.islice(lines, examined):
    size = len(line)
    if size > _MAX_SOURCE_LINE_LEN_UPPER:
      # A single over-long line is enough to call the file binary.
      return u_str, True, is_long
    if size <= _MAX_SOURCE_LINE_LEN_LOWER:
      short_count += 1

  is_binary = (
      examined > 0 and
      short_count / float(examined) < _SOURCE_LINE_LEN_LOWER_RATIO)
  return u_str, is_binary, is_long
OLD | NEW |