appengine/monorail/framework/filecontent.py - Issue 1868553004: Open Source Monorail

Side by Side Diff: appengine/monorail/framework/filecontent.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: Rebase Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 # Copyright 2016 The Chromium Authors. All rights reserved.

	2 # Use of this source code is governed by a BSD-style

	3 # license that can be found in the LICENSE file or at

	4 # https://developers.google.com/open-source/licenses/bsd

	5

	6 """Utility routines for dealing with MIME types and decoding text files."""

	7

	8 import itertools

	9 import logging

	10

	11 from framework import framework_constants

	12

	13

	14 _EXTENSION_TO_CTYPE_TABLE = {

	15 # These are images/PDFs that we trust the browser to display.

	16 'gif': 'image/gif',

	17 'jpg': 'image/jpeg',

	18 'jpeg': 'image/jpeg',

	19 'png': 'image/png',

	20 'ico': 'image/x-icon',

	21 'svg': 'image/svg+xml',

	22 'pdf': 'application/pdf',

	23

	24 # We do not serve mimetypes that cause the brower to launch a local

	25 # app because that is not required for issue tracking and it is a

	26 # potential security risk.

	27 }

	28

	29

	30 def GuessContentTypeFromFilename(filename):

	31 """Guess a file's content type based on the filename extension.

	32

	33 Args:

	34 filename: String name of a file.

	35

	36 Returns:

	37 MIME type string to use when serving this file. We only use text/plain for

	38 text files, appropriate image content-types, or application/octet-stream

	39 for virtually all binary files. This limits the richness of the user's

	40 experience, e.g., the user cannot open an MS Office application directly

	41 by clicking on an attachment, but it is safer.

	42 """

	43 ext = filename.split('.')[-1] if ('.' in filename) else ''

	44 ext = ext.lower()

	45 if ext in COMMON_TEXT_FILE_EXTENSIONS:

	46 return 'text/plain'

	47 return _EXTENSION_TO_CTYPE_TABLE.get(ext.lower(), 'application/octet-stream')

	48

	49

	50 # Constants used in detecting if a file has binary content.

	51 # All line lengths must be below the upper limit, and there must be a spefic

	52 # ratio below the lower limit.

	53 _MAX_SOURCE_LINE_LEN_LOWER = 350

	54 _MAX_SOURCE_LINE_LEN_UPPER = 800

	55 _SOURCE_LINE_LEN_LOWER_RATIO = 0.9

	56

	57 # Message to display for undecodable commit log or author values.

	58 UNDECODABLE_LOG_CONTENT = '[Cannot be displayed]'

	59

	60 # How large a repository file is in bytes before we don't try to display it

	61 SOURCE_FILE_MAX_SIZE = 1000 * 1024

	62 SOURCE_FILE_MAX_LINES = 50000

	63

	64 # The source code browser will not attempt to display any filename ending

	65 # with one of these extensions.

	66 COMMON_BINARY_FILE_EXTENSIONS = {

	67 'gif', 'jpg', 'jpeg', 'psd', 'ico', 'icon', 'xbm', 'xpm', 'xwd', 'pcx',

	68 'bmp', 'png', 'vsd,' 'mpg', 'mpeg', 'wmv', 'wmf', 'avi', 'flv', 'snd',

	69 'mp3', 'wma', 'exe', 'dll', 'bin', 'class', 'o', 'so', 'lib', 'dylib',

	70 'jar', 'ear', 'war', 'par', 'msi', 'tar', 'zip', 'rar', 'cab', 'z', 'gz',

	71 'bz2', 'dmg', 'iso', 'rpm', 'pdf', 'eps', 'tif', 'tiff', 'xls', 'ppt',

	72 'graffie', 'violet',

	73 }

	74

	75 # The source code browser will display file contents as text data for files

	76 # with the following extensions or exact filenames (assuming they decode

	77 # correctly).

	78 COMMON_TEXT_FILE_EXTENSIONS = (

	79 set(framework_constants.PRETTIFY_CLASS_MAP.iterkeys()) \|

	80 { '', 'ada', 'asm', 'asp', 'bat', 'cgi', 'csv', 'el', 'emacs',

	81 'jsp', 'log', 'markdown', 'md', 'mf', 'plist', 'properties', 'r',

	82 'rc', 'txt', 'vim', 'wiki', 'xemacs', 'yacc',

	83 })

	84 COMMON_TEXT_FILENAMES = (

	85 set(framework_constants.PRETTIFY_FILENAME_CLASS_MAP.iterkeys()) \|

	86 {'authors', 'install', 'readme'})

	87

	88

	89 def DecodeFileContents(file_contents, path=None):

	90 """Try converting file contents to unicode using utf-8 or latin-1.

	91

	92 This is applicable to untrusted maybe-text from vcs files or inbound emails.

	93

	94 We try decoding the file as utf-8, then fall back on latin-1. In the former

	95 case, we call the file a text file; in the latter case, we guess whether

	96 the file is text or binary based on line length.

	97

	98 If we guess text when the file is binary, the user sees safely encoded

	99 gibberish. If the other way around, the user sees a message that we will

	100 not display the file.

	101

	102 TODO(jrobbins): we could try the user-supplied encoding, iff it

	103 is one of the encodings that we know that we can handle.

	104

	105 Args:

	106 file_contents: byte string from svn file. It could be text in almost

	107 any encoding, or binary. We cannot trust the user-supplied encoding

	108 in the mime-type property.

	109 path: string pathname of file.

	110

	111 Returns:

	112 The tuple (unicode_string, is_binary, is_long):

	113 - The unicode version of the string.

	114 - is_binary is true if the string could not be decoded as text.

	115 - is_long is true if the file has more than SOURCE_FILE_MAX_LINES lines.

	116 """

	117 # If the filename is one that typically identifies a binary file, then

	118 # just treat it as binary without any further analysis.

	119 ext = None

	120 if path and '.' in path:

	121 ext = path.split('.')[-1]

	122 if ext.lower() in COMMON_BINARY_FILE_EXTENSIONS:

	123 # If the file is binary, we don't care about the length, since we don't

	124 # show or diff it.

	125 return u'', True, False

	126

	127 # If the string can be decoded as utf-8, we treat it as textual.

	128 try:

	129 u_str = file_contents.decode('utf-8', 'strict')

	130 is_long = len(u_str.split('\n')) > SOURCE_FILE_MAX_LINES

	131 return u_str, False, is_long

	132 except UnicodeDecodeError:

	133 logging.info('not a utf-8 file: %s bytes', len(file_contents))

	134

	135 # Fall back on latin-1. This will always succeed, since every byte maps to

	136 # something in latin-1, even if that something is gibberish.

	137 u_str = file_contents.decode('latin-1', 'strict')

	138

	139 lines = u_str.split('\n')

	140 is_long = len(lines) > SOURCE_FILE_MAX_LINES

	141 # Treat decodable files with certain filenames and/or extensions as text

	142 # files. This avoids problems with common file types using our text/binary

	143 # heuristic rules below.

	144 if path:

	145 name = path.split('/')[-1]

	146 if (name.lower() in COMMON_TEXT_FILENAMES or

	147 (ext and ext.lower() in COMMON_TEXT_FILE_EXTENSIONS)):

	148 return u_str, False, is_long

	149

	150 # HEURISTIC: Binary files can qualify as latin-1, so we need to

	151 # check further. Any real source code is going to be divided into

	152 # reasonably sized lines. All lines must be below an upper character limit,

	153 # and most lines must be below a lower limit. This allows some exceptions

	154 # to the lower limit, but is more restrictive than just using a single

	155 # large character limit.

	156 is_binary = False

	157 if lines:

	158 lower_count = 0

	159 for line in itertools.islice(lines, SOURCE_FILE_MAX_LINES):

	160 size = len(line)

	161 if size <= _MAX_SOURCE_LINE_LEN_LOWER:

	162 lower_count += 1

	163 elif size > _MAX_SOURCE_LINE_LEN_UPPER:

	164 is_binary = True

	165 break

	166

	167 ratio = lower_count / float(len(lines))

	168 if ratio < _SOURCE_LINE_LEN_LOWER_RATIO:

	169 is_binary = True

	170

	171 return u_str, is_binary, is_long

OLD	NEW

« no previous file with comments | « appengine/monorail/framework/excessiveactivity.py ('k') | appengine/monorail/framework/framework_bizobj.py » ('j') | no next file with comments »