| Index: appengine/monorail/framework/filecontent.py
|
| diff --git a/appengine/monorail/framework/filecontent.py b/appengine/monorail/framework/filecontent.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..ec3c1715e415c10b0b670ee22e41814b2be6fcec
|
| --- /dev/null
|
| +++ b/appengine/monorail/framework/filecontent.py
|
| @@ -0,0 +1,171 @@
|
| +# Copyright 2016 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is govered by a BSD-style
|
| +# license that can be found in the LICENSE file or at
|
| +# https://developers.google.com/open-source/licenses/bsd
|
| +
|
| +"""Utility routines for dealing with MIME types and decoding text files."""
|
| +
|
| +import itertools
|
| +import logging
|
| +
|
| +from framework import framework_constants
|
| +
|
| +
|
| +_EXTENSION_TO_CTYPE_TABLE = {
|
| + # These are images/PDFs that we trust the browser to display.
|
| + 'gif': 'image/gif',
|
| + 'jpg': 'image/jpeg',
|
| + 'jpeg': 'image/jpeg',
|
| + 'png': 'image/png',
|
| + 'ico': 'image/x-icon',
|
| + 'svg': 'image/svg+xml',
|
| + 'pdf': 'application/pdf',
|
| +
|
| + # We do not serve mimetypes that cause the brower to launch a local
|
| + # app because that is not required for issue tracking and it is a
|
| + # potential security risk.
|
| +}
|
| +
|
| +
|
| +def GuessContentTypeFromFilename(filename):
|
| + """Guess a file's content type based on the filename extension.
|
| +
|
| + Args:
|
| + filename: String name of a file.
|
| +
|
| + Returns:
|
| + MIME type string to use when serving this file. We only use text/plain for
|
| + text files, appropriate image content-types, or application/octet-stream
|
| + for virtually all binary files. This limits the richness of the user's
|
| + experience, e.g., the user cannot open an MS Office application directly
|
| + by clicking on an attachment, but it is safer.
|
| + """
|
| + ext = filename.split('.')[-1] if ('.' in filename) else ''
|
| + ext = ext.lower()
|
| + if ext in COMMON_TEXT_FILE_EXTENSIONS:
|
| + return 'text/plain'
|
| + return _EXTENSION_TO_CTYPE_TABLE.get(ext.lower(), 'application/octet-stream')
|
| +
|
| +
|
| +# Constants used in detecting if a file has binary content.
|
| +# All line lengths must be below the upper limit, and there must be a spefic
|
| +# ratio below the lower limit.
|
| +_MAX_SOURCE_LINE_LEN_LOWER = 350
|
| +_MAX_SOURCE_LINE_LEN_UPPER = 800
|
| +_SOURCE_LINE_LEN_LOWER_RATIO = 0.9
|
| +
|
| +# Message to display for undecodable commit log or author values.
|
| +UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'
|
| +
|
| +# How large a repository file is in bytes before we don't try to display it
|
| +SOURCE_FILE_MAX_SIZE = 1000 * 1024
|
| +SOURCE_FILE_MAX_LINES = 50000
|
| +
|
| +# The source code browser will not attempt to display any filename ending
|
| +# with one of these extensions.
|
| +COMMON_BINARY_FILE_EXTENSIONS = {
|
| + 'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',
|
| + 'bmp', 'png', 'vsd,' 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',
|
| + 'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',
|
| + 'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',
|
| + 'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',
|
| + 'graffie', 'violet',
|
| + }
|
| +
|
| +# The source code browser will display file contents as text data for files
|
| +# with the following extensions or exact filenames (assuming they decode
|
| +# correctly).
|
| +COMMON_TEXT_FILE_EXTENSIONS = (
|
| + set(framework_constants.PRETTIFY_CLASS_MAP.iterkeys()) |
|
| + { '', 'ada', 'asm', 'asp', 'bat', 'cgi', 'csv', 'el', 'emacs',
|
| + 'jsp', 'log', 'markdown', 'md', 'mf', 'plist', 'properties', 'r',
|
| + 'rc', 'txt', 'vim', 'wiki', 'xemacs', 'yacc',
|
| + })
|
| +COMMON_TEXT_FILENAMES = (
|
| + set(framework_constants.PRETTIFY_FILENAME_CLASS_MAP.iterkeys()) |
|
| + {'authors', 'install', 'readme'})
|
| +
|
| +
|
| +def DecodeFileContents(file_contents, path=None):
|
| + """Try converting file contents to unicode using utf-8 or latin-1.
|
| +
|
| + This is applicable to untrusted maybe-text from vcs files or inbound emails.
|
| +
|
| + We try decoding the file as utf-8, then fall back on latin-1. In the former
|
| + case, we call the file a text file; in the latter case, we guess whether
|
| + the file is text or binary based on line length.
|
| +
|
| + If we guess text when the file is binary, the user sees safely encoded
|
| + gibberish. If the other way around, the user sees a message that we will
|
| + not display the file.
|
| +
|
| + TODO(jrobbins): we could try the user-supplied encoding, iff it
|
| + is one of the encodings that we know that we can handle.
|
| +
|
| + Args:
|
| + file_contents: byte string from svn file. It could be text in almost
|
| + any encoding, or binary. We cannot trust the user-supplied encoding
|
| + in the mime-type property.
|
| + path: string pathname of file.
|
| +
|
| + Returns:
|
| + The tuple (unicode_string, is_binary, is_long):
|
| + - The unicode version of the string.
|
| + - is_binary is true if the string could not be decoded as text.
|
| + - is_long is true if the file has more than SOURCE_FILE_MAX_LINES lines.
|
| + """
|
| + # If the filename is one that typically identifies a binary file, then
|
| + # just treat it as binary without any further analysis.
|
| + ext = None
|
| + if path and '.' in path:
|
| + ext = path.split('.')[-1]
|
| + if ext.lower() in COMMON_BINARY_FILE_EXTENSIONS:
|
| + # If the file is binary, we don't care about the length, since we don't
|
| + # show or diff it.
|
| + return u'', True, False
|
| +
|
| + # If the string can be decoded as utf-8, we treat it as textual.
|
| + try:
|
| + u_str = file_contents.decode('utf-8', 'strict')
|
| + is_long = len(u_str.split('\n')) > SOURCE_FILE_MAX_LINES
|
| + return u_str, False, is_long
|
| + except UnicodeDecodeError:
|
| + logging.info('not a utf-8 file: %s bytes', len(file_contents))
|
| +
|
| + # Fall back on latin-1. This will always succeed, since every byte maps to
|
| + # something in latin-1, even if that something is gibberish.
|
| + u_str = file_contents.decode('latin-1', 'strict')
|
| +
|
| + lines = u_str.split('\n')
|
| + is_long = len(lines) > SOURCE_FILE_MAX_LINES
|
| + # Treat decodable files with certain filenames and/or extensions as text
|
| + # files. This avoids problems with common file types using our text/binary
|
| + # heuristic rules below.
|
| + if path:
|
| + name = path.split('/')[-1]
|
| + if (name.lower() in COMMON_TEXT_FILENAMES or
|
| + (ext and ext.lower() in COMMON_TEXT_FILE_EXTENSIONS)):
|
| + return u_str, False, is_long
|
| +
|
| + # HEURISTIC: Binary files can qualify as latin-1, so we need to
|
| + # check further. Any real source code is going to be divided into
|
| + # reasonably sized lines. All lines must be below an upper character limit,
|
| + # and most lines must be below a lower limit. This allows some exceptions
|
| + # to the lower limit, but is more restrictive than just using a single
|
| + # large character limit.
|
| + is_binary = False
|
| + if lines:
|
| + lower_count = 0
|
| + for line in itertools.islice(lines, SOURCE_FILE_MAX_LINES):
|
| + size = len(line)
|
| + if size <= _MAX_SOURCE_LINE_LEN_LOWER:
|
| + lower_count += 1
|
| + elif size > _MAX_SOURCE_LINE_LEN_UPPER:
|
| + is_binary = True
|
| + break
|
| +
|
| + ratio = lower_count / float(len(lines))
|
| + if ratio < _SOURCE_LINE_LEN_LOWER_RATIO:
|
| + is_binary = True
|
| +
|
| + return u_str, is_binary, is_long
|
|
|