Index: appengine/monorail/framework/filecontent.py |
diff --git a/appengine/monorail/framework/filecontent.py b/appengine/monorail/framework/filecontent.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..ec3c1715e415c10b0b670ee22e41814b2be6fcec |
--- /dev/null |
+++ b/appengine/monorail/framework/filecontent.py |
@@ -0,0 +1,171 @@ |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style |
+# license that can be found in the LICENSE file or at |
+# https://developers.google.com/open-source/licenses/bsd |
+ |
+"""Utility routines for dealing with MIME types and decoding text files.""" |
+ |
+import itertools |
+import logging |
+ |
+from framework import framework_constants |
+ |
+ |
+_EXTENSION_TO_CTYPE_TABLE = dict( |
+    # Only image and PDF types that we trust the browser to display. |
+    gif='image/gif', |
+    jpg='image/jpeg', |
+    jpeg='image/jpeg', |
+    png='image/png', |
+    ico='image/x-icon', |
+    svg='image/svg+xml', |
+    pdf='application/pdf', |
+ |
+    # Mimetypes that cause the browser to launch a local app are left out |
+    # on purpose: they are not required for issue tracking and they are a |
+    # potential security risk. |
+) |
+ |
+ |
+def GuessContentTypeFromFilename(filename): |
+  """Pick the MIME type to serve a file with, judging by its extension. |
+ |
+  Args: |
+    filename: String name of a file. |
+ |
+  Returns: |
+    MIME type string: text/plain for known text extensions (or no extension |
+    at all), the matching type for the few image/PDF formats we trust the |
+    browser to display, and application/octet-stream for everything else. |
+    Users cannot, e.g., open MS Office attachments directly, but it is safer. |
+  """ |
+  # rpartition() returns an empty separator when the name contains no dot. |
+  _, dot, suffix = filename.rpartition('.') |
+  ext = suffix.lower() if dot else '' |
+  if ext in COMMON_TEXT_FILE_EXTENSIONS: |
+    return 'text/plain' |
+  return _EXTENSION_TO_CTYPE_TABLE.get(ext, 'application/octet-stream') |
+ |
+ |
+# Constants used in detecting if a file has binary content. |
+# No line may be longer than the upper limit, and at least the given ratio |
+# of the lines must be no longer than the lower limit. |
+_MAX_SOURCE_LINE_LEN_LOWER = 350 |
+_MAX_SOURCE_LINE_LEN_UPPER = 800 |
+_SOURCE_LINE_LEN_LOWER_RATIO = 0.9 |
+ |
+# Message to display for undecodable commit log or author values. |
+UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]' |
+ |
+# How large a repository file is in bytes before we don't try to display it. |
+SOURCE_FILE_MAX_SIZE = 1000 * 1024 |
+SOURCE_FILE_MAX_LINES = 50000  # Files with more lines are reported as long. |
+ |
+# The source code browser will not attempt to display any filename ending |
+# with one of these extensions (lowercase, without the leading dot). |
+COMMON_BINARY_FILE_EXTENSIONS = { |
+    'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx', |
+    'bmp', 'png', 'vsd', 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd', |
+    'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib', |
+    'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz', |
+    'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt', |
+    'graffle', 'violet', |
+    } |
+ |
+# The source code browser displays file contents as text for files with one |
+# of these extensions or exact (lowercased) filenames, provided they decode |
+# correctly. set(mapping) collects the mapping's keys on Python 2 and 3 alike. |
+COMMON_TEXT_FILE_EXTENSIONS = ( |
+    set(framework_constants.PRETTIFY_CLASS_MAP) | |
+    {'', 'ada', 'asm', 'asp', 'bat', 'cgi', 'csv', 'el', 'emacs', |
+     'jsp', 'log', 'markdown', 'md', 'mf', 'plist', 'properties', 'r', |
+     'rc', 'txt', 'vim', 'wiki', 'xemacs', 'yacc', |
+    }) |
+COMMON_TEXT_FILENAMES = ( |
+    set(framework_constants.PRETTIFY_FILENAME_CLASS_MAP) | |
+    {'authors', 'install', 'readme'}) |
+ |
+ |
+def DecodeFileContents(file_contents, path=None): |
+  """Try converting file contents to unicode using utf-8 or latin-1. |
+ |
+  This is applicable to untrusted maybe-text from vcs files or inbound emails. |
+ |
+  We try decoding the file as utf-8, then fall back on latin-1. In the former |
+  case, we call the file a text file; in the latter case, we guess whether |
+  the file is text or binary based on its name and on line lengths. |
+ |
+  If we guess text when the file is binary, the user sees safely encoded |
+  gibberish. If the other way around, the user sees a message that we will |
+  not display the file. |
+ |
+  TODO(jrobbins): we could try the user-supplied encoding, iff it |
+  is one of the encodings that we know that we can handle. |
+ |
+  Args: |
+    file_contents: byte string from svn file. It could be text in almost |
+        any encoding, or binary. We cannot trust the user-supplied encoding |
+        in the mime-type property. |
+    path: optional string pathname of the file; used for name-based guesses. |
+ |
+  Returns: |
+    The tuple (unicode_string, is_binary, is_long): |
+      - The unicode version of the string; u'' for a binary file extension. |
+      - is_binary is true if we judge that the file is not text. |
+      - is_long is true if the file has more than SOURCE_FILE_MAX_LINES lines. |
+  """ |
+  # If the filename is one that typically identifies a binary file, then |
+  # just treat it as binary without any further analysis. |
+  ext = None |
+  if path and '.' in path: |
+    ext = path.split('.')[-1].lower() |
+    if ext in COMMON_BINARY_FILE_EXTENSIONS: |
+      # If the file is binary, we don't care about the length, since we don't |
+      # show or diff it. |
+      return u'', True, False |
+ |
+  # If the string can be decoded as utf-8, we treat it as textual. |
+  try: |
+    u_str = file_contents.decode('utf-8', 'strict') |
+  except UnicodeDecodeError: |
+    logging.info('not a utf-8 file: %s bytes', len(file_contents)) |
+  else: |
+    return u_str, False, len(u_str.split('\n')) > SOURCE_FILE_MAX_LINES |
+ |
+  # Fall back on latin-1. This will always succeed, since every byte maps to |
+  # something in latin-1, even if that something is gibberish. |
+  u_str = file_contents.decode('latin-1', 'strict') |
+ |
+  lines = u_str.split('\n') |
+  is_long = len(lines) > SOURCE_FILE_MAX_LINES |
+  # Treat decodable files with certain filenames and/or extensions as text |
+  # files. This avoids problems with common file types using our text/binary |
+  # heuristic rules below. |
+  if path: |
+    name = path.split('/')[-1] |
+    if (name.lower() in COMMON_TEXT_FILENAMES or |
+        (ext and ext in COMMON_TEXT_FILE_EXTENSIONS)): |
+      return u_str, False, is_long |
+ |
+  # HEURISTIC: Binary files can qualify as latin-1, so we need to |
+  # check further. Any real source code is going to be divided into |
+  # reasonably sized lines. All lines must be below an upper character limit, |
+  # and most lines must be below a lower limit. This allows some exceptions |
+  # to the lower limit, but is more restrictive than just using a single |
+  # large character limit. |
+  # Only the first SOURCE_FILE_MAX_LINES lines are examined, so the ratio is |
+  # taken over that many lines rather than over the whole file; otherwise a |
+  # very long file made of short lines would always be classified as binary. |
+  # split() always yields at least one line, so num_examined is at least 1. |
+  num_examined = min(len(lines), SOURCE_FILE_MAX_LINES) |
+  lower_count = 0 |
+  for line in itertools.islice(lines, num_examined): |
+    size = len(line) |
+    if size > _MAX_SOURCE_LINE_LEN_UPPER: |
+      return u_str, True, is_long |
+    if size <= _MAX_SOURCE_LINE_LEN_LOWER: |
+      lower_count += 1 |
+ |
+  ratio = lower_count / float(num_examined) |
+  is_binary = ratio < _SOURCE_LINE_LEN_LOWER_RATIO |
+  return u_str, is_binary, is_long |