appengine/monorail/framework/filecontent.py - Issue 1868553004: Open Source Monorail

Unified Diff: appengine/monorail/framework/filecontent.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Rebase Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: appengine/monorail/framework/filecontent.py

diff --git a/appengine/monorail/framework/filecontent.py b/appengine/monorail/framework/filecontent.py

new file mode 100644

index 0000000000000000000000000000000000000000..ec3c1715e415c10b0b670ee22e41814b2be6fcec

--- /dev/null

+++ b/appengine/monorail/framework/filecontent.py

@@ -0,0 +1,171 @@

+# Use of this source code is governed by a BSD-style

+# license that can be found in the LICENSE file or at

+# https://developers.google.com/open-source/licenses/bsd

+"""Utility routines for dealing with MIME types and decoding text files."""

+import itertools

+import logging

+from framework import framework_constants

# Maps a lower-case filename extension (without the dot) to the MIME type
# used when serving a file with that extension.  Extensions that are not
# listed here are not served with a specific content type by
# GuessContentTypeFromFilename().
_EXTENSION_TO_CTYPE_TABLE = {
    # These are images/PDFs that we trust the browser to display.
    'gif': 'image/gif',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'png': 'image/png',
    'ico': 'image/x-icon',
    'svg': 'image/svg+xml',
    'pdf': 'application/pdf',

    # We do not serve mimetypes that cause the browser to launch a local
    # app because that is not required for issue tracking and it is a
    # potential security risk.
}

def GuessContentTypeFromFilename(filename):
    """Pick the MIME type to serve a file with, based only on its extension.

    The extension is whatever follows the last '.' in the name, compared
    case-insensitively; a name without any '.' has the empty extension.

    Args:
      filename: String name of a file.

    Returns:
      MIME type string to use when serving this file.  We only use
      text/plain for text files, appropriate image content-types, or
      application/octet-stream for virtually all binary files.  This limits
      the richness of the user's experience, e.g., the user cannot open an
      MS Office application directly by clicking on an attachment, but it
      is safer.
    """
    _, dot, suffix = filename.rpartition('.')
    # Without a dot rpartition() puts the whole name in `suffix`, so the
    # separator has to be checked before trusting it as an extension.
    ext = suffix.lower() if dot else ''

    # Known text extensions take priority over the content-type table.
    if ext in COMMON_TEXT_FILE_EXTENSIONS:
        return 'text/plain'

    return _EXTENSION_TO_CTYPE_TABLE.get(ext, 'application/octet-stream')

# Thresholds for the line-length heuristic that decides whether a file has
# binary content.  Every line must be no longer than the upper limit, and at
# least a specific ratio of the lines must be no longer than the lower limit.
_MAX_SOURCE_LINE_LEN_LOWER = 350
_MAX_SOURCE_LINE_LEN_UPPER = 800
_SOURCE_LINE_LEN_LOWER_RATIO = 0.9

# Placeholder shown in place of commit log or author values that cannot be
# decoded.
UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'

# Size, in bytes, above which we do not try to display a repository file.
SOURCE_FILE_MAX_SIZE = 1000 * 1024

# Files with more lines than this are reported as "long" by
# DecodeFileContents(), which also examines at most this many lines in its
# binary heuristic.
SOURCE_FILE_MAX_LINES = 50000

# The source code browser will not attempt to display any filename ending
# with one of these extensions.  Entries are lower-case and have no leading
# dot; callers lower-case the extension before testing membership.
#
# NOTE(review): 'graffie' may have been meant as 'graffle' (OmniGraffle);
# left unchanged here -- confirm before altering.
COMMON_BINARY_FILE_EXTENSIONS = {
    'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',
    'bmp', 'png', 'vsd', 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',
    'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',
    'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',
    'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',
    'graffie', 'violet',
}

# The source code browser will display file contents as text data for files
# with the following extensions or exact filenames (assuming they decode
# correctly).  The set is every extension known to
# framework_constants.PRETTIFY_CLASS_MAP plus the extras listed below.  The
# empty string matches names that have no extension when looked up by
# GuessContentTypeFromFilename().
COMMON_TEXT_FILE_EXTENSIONS = (
    # Iterating a mapping yields its keys, so set(mapping) works on both
    # Python 2 and Python 3 (dict.iterkeys() exists only on Python 2).
    set(framework_constants.PRETTIFY_CLASS_MAP) |
    {
        '', 'ada', 'asm', 'asp', 'bat', 'cgi', 'csv', 'el', 'emacs',
        'jsp', 'log', 'markdown', 'md', 'mf', 'plist', 'properties', 'r',
        'rc', 'txt', 'vim', 'wiki', 'xemacs', 'yacc',
    })

# Exact (lower-cased) file names that are always treated as text: every name
# known to framework_constants.PRETTIFY_FILENAME_CLASS_MAP plus a few
# conventional extension-less project files.
COMMON_TEXT_FILENAMES = (
    # set(mapping) collects the keys on both Python 2 and Python 3
    # (dict.iterkeys() exists only on Python 2).
    set(framework_constants.PRETTIFY_FILENAME_CLASS_MAP) |
    {'authors', 'install', 'readme'})

def DecodeFileContents(file_contents, path=None):
    """Try converting file contents to unicode using utf-8 or latin-1.

    This is applicable to untrusted maybe-text from vcs files or inbound
    emails.

    We try decoding the file as utf-8, then fall back on latin-1.  In the
    former case, we call the file a text file; in the latter case, we guess
    whether the file is text or binary based on line length.

    If we guess text when the file is binary, the user sees safely encoded
    gibberish.  If the other way around, the user sees a message that we will
    not display the file.

    TODO(jrobbins): we could try the user-supplied encoding, iff it
    is one of the encodings that we know that we can handle.

    Args:
      file_contents: byte string from svn file.  It could be text in almost
          any encoding, or binary.  We cannot trust the user-supplied
          encoding in the mime-type property.
      path: string pathname of file, or None if unknown.  Only the final
          '/'-separated component is used, to look up the file name and its
          extension.

    Returns:
      The tuple (unicode_string, is_binary, is_long):
        - The unicode version of the string (empty when the extension alone
          marks the file as binary).
        - is_binary is true if the string could not be decoded as text.
        - is_long is true if the file has more than SOURCE_FILE_MAX_LINES
          lines.  It is always False when the extension alone marks the file
          as binary.
    """
    # Work from the last path component so that a '.' in a directory name is
    # never mistaken for the start of a file extension.
    name = path.split('/')[-1].lower() if path else ''
    ext = name.split('.')[-1] if '.' in name else None

    # If the filename is one that typically identifies a binary file, then
    # just treat it as binary without any further analysis.  We don't care
    # about the length, since we don't show or diff it.
    if ext is not None and ext in COMMON_BINARY_FILE_EXTENSIONS:
        return u'', True, False

    # If the string can be decoded as utf-8, we treat it as textual.
    try:
        u_str = file_contents.decode('utf-8', 'strict')
    except UnicodeDecodeError:
        logging.info('not a utf-8 file: %s bytes', len(file_contents))
    else:
        # Number of lines == number of newlines + 1; counting avoids building
        # a list of every line just to measure it.
        is_long = u_str.count('\n') + 1 > SOURCE_FILE_MAX_LINES
        return u_str, False, is_long

    # Fall back on latin-1.  This will always succeed, since every byte maps
    # to something in latin-1, even if that something is gibberish.
    u_str = file_contents.decode('latin-1', 'strict')
    lines = u_str.split('\n')
    is_long = len(lines) > SOURCE_FILE_MAX_LINES

    # Treat decodable files with certain filenames and/or extensions as text
    # files.  This avoids problems with common file types using our
    # text/binary heuristic rules below.
    if path and (name in COMMON_TEXT_FILENAMES or
                 (ext and ext in COMMON_TEXT_FILE_EXTENSIONS)):
        return u_str, False, is_long

    # HEURISTIC: Binary files can qualify as latin-1, so we need to
    # check further.  Any real source code is going to be divided into
    # reasonably sized lines.  All lines must be below an upper character
    # limit, and most lines must be below a lower limit.  This allows some
    # exceptions to the lower limit, but is more restrictive than just using
    # a single large character limit.
    lower_count = 0
    for line in itertools.islice(lines, SOURCE_FILE_MAX_LINES):
        size = len(line)
        if size > _MAX_SOURCE_LINE_LEN_UPPER:
            # A single over-long line is enough to call the file binary.
            return u_str, True, is_long
        if size <= _MAX_SOURCE_LINE_LEN_LOWER:
            lower_count += 1

    # The ratio must be taken over the lines actually examined.  Dividing by
    # len(lines) would count every line past SOURCE_FILE_MAX_LINES as "too
    # long" and misclassify very long text files as binary.  str.split()
    # always returns at least one element, so `examined` is never zero.
    examined = min(len(lines), SOURCE_FILE_MAX_LINES)
    ratio = lower_count / float(examined)
    is_binary = ratio < _SOURCE_LINE_LEN_LOWER_RATIO

    return u_str, is_binary, is_long

« no previous file with comments | « appengine/monorail/framework/excessiveactivity.py ('k') | appengine/monorail/framework/framework_bizobj.py » ('j') | no next file with comments »