android_webview/tools/copyright_scanner.py - Issue 622493004: [Android WebView] Rewrite copyrights scanner in Python

Side by Side Diff: android_webview/tools/copyright_scanner.py

Issue 622493004: [Android WebView] Rewrite copyrights scanner in Python (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 # Copyright 2014 The Chromium Authors. All rights reserved.

	2 # Use of this source code is governed by a BSD-style license that can be

	3 # found in the LICENSE file.

	4

	5 """Utilities for scanning source files to determine code authorship.

	6 """

	7

	8 import itertools

	9 import os

	10 import re

	11

	12

	13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list):

	14 """Similar to UNIX utility find(1), searches for files in the directories.

	15 Automatically leaves out only source code files.

	16 Args:

	17 root_dir: The root directory, to which all other paths are relative.

	18 start_paths_list: The list of paths to start search from. Each path can

	19 be a file or a directory.

	20 excluded_dirs_list: The list of directories to skip.

	21 Returns:

	22 The list of source code files found, relative to \|root_dir\|.

	23 """

	24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]

	25 def IsBlacklistedDir(d):

	26 for item in dirs_blacklist:

	27 if item in d:

	28 return True

	29 return False

	30

	31 files_whitelist_re = re.compile(

	32 r'\.(asm\|c(c\|pp\|xx)?\|h(h\|pp\|xx)?\|p(l\|m)\|xs\|sh\|php\|py(\|x)'

	33 '\|rb\|idl\|java\|el\|sc(i\|e)\|cs\|pas\|inc\|js\|pac\|html\|dtd\|xsl\|mod\|mm?'

	34 '\|tex\|mli?)$')

	35 files = []

	36

	37 base_path_len = len(root_dir)

	38 for path in start_paths_list:

	39 full_path = os.path.join(root_dir, path)

	40 if os.path.isfile(full_path):

	41 if files_whitelist_re.search(path):

	42 files.append(path)

	43 else:

	44 for dirpath, dirnames, filenames in os.walk(full_path):

	45 # Remove excluded subdirs for faster scanning.

	46 for item in dirnames[:]:

	47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]):

	48 dirnames.remove(item)
	mkosiba (inactive) 2014/10/02 13:14:46 the os.walk docs say: the os.walk docs say: Show quoted text > When topdown is True, the caller can modify the dirnames list in-place (perhaps using del or slice assignment), and walk() will only recurse into the subdirectories whose names remain in dirnames [...] Modifying dirnames when topdown is False is ineffective, because in bottom-up mode the directories in dirnames are generated before dirpath itself is generated. so I guess you'll want to os.walk(full_path, topdown=True) mnaganov (inactive) 2014/10/02 13:51:30 Yeah, but the same doc also lists the defaults for Show quoted text On 2014/10/02 13:14:46, mkosiba wrote: > the os.walk docs say: > > > When topdown is True, the caller can modify the dirnames list in-place > (perhaps using del or slice assignment), and walk() will only recurse into the > subdirectories whose names remain in dirnames [...] Modifying dirnames when > topdown is False is ineffective, because in bottom-up mode the directories in > dirnames are generated before dirpath itself is generated. > > so I guess you'll want to os.walk(full_path, topdown=True) Yeah, but the same doc also lists the defaults for this function: os.walk(top, topdown=True, onerror=None, followlinks=False)
	49 for filename in filenames:

	50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:]

	51 if files_whitelist_re.search(filepath) and \

	52 not IsBlacklistedDir(filepath):

	53 files.append(filepath)

	54 return files

	55

	56

	57 python_multiline_string_double_re = re.compile(

	58 r'"""[^"]*(?:"""\|$)', flags=re.MULTILINE)

	59 python_multiline_string_single_re = re.compile(

	60 r"'''[^']*(?:'''\|$)", flags=re.MULTILINE)

	61 automatically_generated_re = re.compile(

	62 r'(All changes made in this file will be lost'

	63 '\|DO NOT (EDIT\|delete this file)'

	64 '\|Generated (at\|automatically\|data)'

	65 '\|Automatically generated'

	66 '\|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)

	67

	68 def _IsGeneratedFile(header):

	69 header = header.upper()

	70 if '"""' in header:

	71 header = python_multiline_string_double_re.sub('', header)

	72 if "'''" in header:

	73 header = python_multiline_string_single_re.sub('', header)

	74 # First do simple strings lookup to save time.

	75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:

	76 return True

	77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \

	78 'GENERATED' in header:

	79 return automatically_generated_re.search(header)

	80 return False

	81

	82

	83 GENERATED_FILE = 'GENERATED FILE'

	84 NO_COPYRIGHT = 'No copyright'

	85

	86 class _CopyrightsScanner(object):

	87 _c_comment_re = re.compile(r'''"[^"\\](?:\\.[^"\\])*"''')

	88 _copyright_indicator = r'(?:copyright\|copr\.\|\xc2\xa9\|$c$)'

	89 _full_copyright_indicator_re = \

	90 re.compile(r'(?:\W\|^)' + _copyright_indicator + r'(?::\s\|\s+)(\w.)$', \

	91 re.IGNORECASE)

	92 _copyright_disindicator_re = \

	93 re.compile(r'\s*\b(?:info(?:rmation)?\|notice\|and\|or)\b', re.IGNORECASE)

	94

	95 def __init__(self):

	96 self.max_line_numbers_proximity = 3

	97 self.last_a_item_line_number = -200

	98 self.last_b_item_line_number = -100

	99

	100 def _CloseLineNumbers(self, a, b):

	101 return 0 <= a - b <= self.max_line_numbers_proximity

	102

	103 def MatchLine(self, line_number, line):

	104 if '"' in line:

	105 line = _CopyrightsScanner._c_comment_re.sub('', line)

	106 upcase_line = line.upper()

	107 # Record '(a)' and '(b)' last occurences in C++ comments.
	mkosiba (inactive) 2014/10/02 13:14:46 Maybe move/copy the explanation from line 121 to h Maybe move/copy the explanation from line 121 to here. It makes it easier to understand why we're recording (a) and (b) mnaganov (inactive) 2014/10/02 13:51:30 Done. Show quoted text On 2014/10/02 13:14:46, mkosiba wrote: > Maybe move/copy the explanation from line 121 to here. It makes it easier to > understand why we're recording (a) and (b) Done.
	108 cpp_comment_idx = upcase_line.find('//')

	109 if not cpp_comment_idx == -1:
	mkosiba (inactive) 2014/10/02 13:14:47 cpp_comment_idx != -1 or even '//' in upcase_line cpp_comment_idx != -1 or even '//' in upcase_line mnaganov (inactive) 2014/10/02 13:51:30 We use the value of cpp_comment_idx if it's not -1 Show quoted text On 2014/10/02 13:14:47, mkosiba wrote: > cpp_comment_idx != -1 or even '//' in upcase_line We use the value of cpp_comment_idx if it's not -1, so just changed to "!= -1".
	110 if upcase_line.find('(A)') > cpp_comment_idx:

	111 self.last_a_item_line_number = line_number

	112 if upcase_line.find('(B)') > cpp_comment_idx:

	113 self.last_b_item_line_number = line_number

	114 # Fast bailout, uses the same patterns as _copyright_indicator regexp.

	115 if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \

	116 and not '\xc2\xa9' in upcase_line:

	117 c_item_index = upcase_line.find('(C)')

	118 if c_item_index == -1:

	119 return None

	120 # Filter out 'c' used as a list item inside C++ comments.

	121 # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"

	122 if c_item_index > cpp_comment_idx and \

	123 self._CloseLineNumbers(line_number,

	124 self.last_b_item_line_number) and \

	125 self._CloseLineNumbers(self.last_b_item_line_number,

	126 self.last_a_item_line_number):

	127 return None

	128 copyr = None

	129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)

	130 if m and \

	131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):

	132 copyr = m.group(0)

	133 # Prettify the authorship string.

	134 copyr = re.sub(r'([,.])?\s*$/', '', copyr)

	135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)

	136 copyr = re.sub(r'^\s+', '', copyr)

	137 copyr = re.sub(r'\s{2,}', ' ', copyr)

	138 copyr = re.sub(r'\\@', '@', copyr)

	139 return copyr

	140

	141

	142 def FindCopyrights(root_dir, files_to_scan):

	143 """Determines code autorship, and finds generated files.

	144 Args:

	145 root_dir: The root directory, to which all other paths are relative.

	146 files_to_scan: The list of file names to scan.

	147 Returns:

	148 The list of copyrights associated with each of the files given.

	149 If the certain file is generated, the corresponding list consists a single

	150 entry -- 'GENERATED_FILE' string. If the file has no copyright info,

	151 the corresponding list contains 'NO_COPYRIGHT' string.

	152 """

	153 copyrights = []

	154 for f in files_to_scan:

	155 linenum = 0

	156 header = ''

	157 file_copyrights = []

	158 scanner = _CopyrightsScanner()

	159 for l in open(os.path.join(root_dir, f), 'r').readlines():
	mkosiba (inactive) 2014/10/02 13:14:46 umm.. you might run out of fd's if you don't close umm.. you might run out of fd's if you don't close the file? maybe use 'with' ? mnaganov (inactive) 2014/10/02 13:51:30 That's a really good catch, thanks! Fixed here and Show quoted text On 2014/10/02 13:14:46, mkosiba wrote: > umm.. you might run out of fd's if you don't close the file? maybe use 'with' ? That's a really good catch, thanks! Fixed here and also in webview_licenses.py
	160 linenum += 1

	161 if linenum <= 25:

	162 header += l

	163 c = scanner.MatchLine(linenum, l)

	164 if c:

	165 file_copyrights.append(c)

	166 if _IsGeneratedFile(header):

	167 copyrights.append([GENERATED_FILE])

	168 elif file_copyrights:

	169 copyrights.append(file_copyrights)

	170 else:

	171 copyrights.append([NO_COPYRIGHT])

	172 return copyrights

	173

	174

	175 def FindCopyrightViolations(root_dir, files_to_scan):

	176 """Looks for files that are not belong exlusively to the Chromium Authors.

	177 Args:

	178 root_dir: The root directory, to which all other paths are relative.

	179 files_to_scan: The list of file names to scan.

	180 Returns:

	181 The list of file names that contain non-Chromium copyrights.

	182 """

	183 copyrights = FindCopyrights(root_dir, files_to_scan)

	184 offending_files = []

	185 allowed_copyrights_re = re.compile(

	186 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '

	187 'All rights reserved.*)$')

	188 for f, cs in itertools.izip(files_to_scan, copyrights):

	189 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:

	190 continue

	191 for c in cs:

	192 if not allowed_copyrights_re.match(c):

	193 offending_files.append(os.path.normpath(f))

	194 break

	195 return offending_files

OLD	NEW

« no previous file with comments | « no previous file | android_webview/tools/find_copyrights.pl » ('j') | no next file with comments »