android_webview/tools/copyright_scanner.py - Issue 667723002: [Android WebView] Prepare the copyrights scanner to run from presubmit scripts

Side by Side Diff: android_webview/tools/copyright_scanner.py

Issue 667723002: [Android WebView] Prepare the copyrights scanner to run from presubmit scripts (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Add a removed empty line Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2014 The Chromium Authors. All rights reserved.	1 # Copyright 2014 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Utilities for scanning source files to determine code authorship.	5 """Utilities for scanning source files to determine code authorship.

6 """	6 """

7	7

8 import itertools	8 import itertools

9 import os

10 import re

11	9

12	10

13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list):	11 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):

14 """Similar to UNIX utility find(1), searches for files in the directories.	12 """Similar to UNIX utility find(1), searches for files in the directories.

15 Automatically leaves out only source code files.	13 Automatically leaves out only source code files.

16 Args:	14 Args:

	15 input_api: InputAPI, as in presubmit scripts.

17 root_dir: The root directory, to which all other paths are relative.	16 root_dir: The root directory, to which all other paths are relative.

18 start_paths_list: The list of paths to start search from. Each path can	17 start_paths_list: The list of paths to start search from. Each path can

19 be a file or a directory.	18 be a file or a directory.

20 excluded_dirs_list: The list of directories to skip.	19 excluded_dirs_list: The list of directories to skip.

21 Returns:	20 Returns:

22 The list of source code files found, relative to \|root_dir\|.	21 The list of source code files found, relative to \|root_dir\|.

23 """	22 """

24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]	23 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]

25 def IsBlacklistedDir(d):	24 def IsBlacklistedDir(d):

26 for item in dirs_blacklist:	25 for item in dirs_blacklist:

27 if item in d:	26 if item in d:

28 return True	27 return True

29 return False	28 return False

30	29

31 files_whitelist_re = re.compile(	30 files_whitelist_re = input_api.re.compile(

32 r'\.(asm\|c(c\|pp\|xx)?\|h(h\|pp\|xx)?\|p(l\|m)\|xs\|sh\|php\|py(\|x)'	31 r'\.(asm\|c(c\|pp\|xx)?\|h(h\|pp\|xx)?\|p(l\|m)\|xs\|sh\|php\|py(\|x)'

33 '\|rb\|idl\|java\|el\|sc(i\|e)\|cs\|pas\|inc\|js\|pac\|html\|dtd\|xsl\|mod\|mm?'	32 '\|rb\|idl\|java\|el\|sc(i\|e)\|cs\|pas\|inc\|js\|pac\|html\|dtd\|xsl\|mod\|mm?'

34 '\|tex\|mli?)$')	33 '\|tex\|mli?)$')

35 files = []	34 files = []

36	35

37 base_path_len = len(root_dir)	36 base_path_len = len(root_dir)

38 for path in start_paths_list:	37 for path in start_paths_list:

39 full_path = os.path.join(root_dir, path)	38 full_path = input_api.os_path.join(root_dir, path)

40 if os.path.isfile(full_path):	39 if input_api.os_path.isfile(full_path):

41 if files_whitelist_re.search(path):	40 if files_whitelist_re.search(path):

42 files.append(path)	41 files.append(path)

43 else:	42 else:

44 for dirpath, dirnames, filenames in os.walk(full_path):	43 for dirpath, dirnames, filenames in input_api.os_walk(full_path):

45 # Remove excluded subdirs for faster scanning.	44 # Remove excluded subdirs for faster scanning.

46 for item in dirnames[:]:	45 for item in dirnames[:]:

47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]):	46 if IsBlacklistedDir(

	47 input_api.os_path.join(dirpath, item)[base_path_len + 1:]):

48 dirnames.remove(item)	48 dirnames.remove(item)

49 for filename in filenames:	49 for filename in filenames:

50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:]	50 filepath = \

	51 input_api.os_path.join(dirpath, filename)[base_path_len + 1:]

51 if files_whitelist_re.search(filepath) and \	52 if files_whitelist_re.search(filepath) and \

52 not IsBlacklistedDir(filepath):	53 not IsBlacklistedDir(filepath):

53 files.append(filepath)	54 files.append(filepath)

54 return files	55 return files

55	56

56	57

57 python_multiline_string_double_re = re.compile(	58 class _GeneratedFilesDetector(object):

58 r'"""[^"]*(?:"""\|$)', flags=re.MULTILINE)	59 GENERATED_FILE = 'GENERATED FILE'

59 python_multiline_string_single_re = re.compile(	60 NO_COPYRIGHT = 'No copyright'

60 r"'''[^']*(?:'''\|$)", flags=re.MULTILINE)

61 automatically_generated_re = re.compile(

62 r'(All changes made in this file will be lost'

63 '\|DO NOT (EDIT\|delete this file)'

64 '\|Generated (at\|automatically\|data)'

65 '\|Automatically generated'

66 '\|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)

67	61

68 def _IsGeneratedFile(header):	62 @staticmethod

69 header = header.upper()	63 def StaticInit(input_api):

70 if '"""' in header:	64 _GeneratedFilesDetector.python_multiline_string_double_re = \
	mkosiba (inactive) 2014/10/21 15:41:22
	Wouldn't it be simpler to have these be instance methods and pass input_api to the _GeneratedFilesDetector as an argument? I think you'd end up with at most one instance per script invocation, right?
	Also, I think at least theoretically you should keep the invariant that input_api doesn't 'leak' outside of the callstack, right?
	mnaganov (inactive) 2014/10/22 09:27:29
	On 2014/10/21 15:41:22, mkosiba wrote:
	> wouldn't it be simpler to have these be instance methods and pass input_api to
	> the _GeneratedFilesDetector as an argument? I think you'd end up with at most
	> one instance per script invocation, right?
	Done. But it's different for _CopyrightsScanner, as I spin up a new instance per scanned file (as it has per-file state). But _CopyrightsScanner as a class is only used once. I could come up with a "factory" class that emits "scanners", but I think this will add unnecessary complexity.
	> Also, I think at least theoretically you should keep the invariant that
	> input_api doesn't 'leak' outside of the callstack, right?
	It never does. I don't store references to input_api itself.
71 header = python_multiline_string_double_re.sub('', header)	65 input_api.re.compile(r'"""[^"]*(?:"""\|$)', flags=input_api.re.MULTILINE)

72 if "'''" in header:	66 _GeneratedFilesDetector.python_multiline_string_single_re = \

73 header = python_multiline_string_single_re.sub('', header)	67 input_api.re.compile(r"'''[^']*(?:'''\|$)", flags=input_api.re.MULTILINE)

74 # First do simple strings lookup to save time.	68 _GeneratedFilesDetector.automatically_generated_re = input_api.re.compile(

75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:	69 r'(All changes made in this file will be lost'

76 return True	70 '\|DO NOT (EDIT\|delete this file)'

77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \	71 '\|Generated (at\|automatically\|data)'

78 'GENERATED' in header:	72 '\|Automatically generated'

79 return automatically_generated_re.search(header)	73 '\|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

80 return False	74

	75 @staticmethod

	76 def _IsGeneratedFile(header):

	77 header = header.upper()

	78 if '"""' in header:

	79 header = _GeneratedFilesDetector.python_multiline_string_double_re.sub(

	80 '', header)

	81 if "'''" in header:

	82 header = _GeneratedFilesDetector.python_multiline_string_single_re.sub(

	83 '', header)

	84 # First do simple strings lookup to save time.

	85 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:

	86 return True

	87 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \

	88 'GENERATED' in header:

	89 return _GeneratedFilesDetector.automatically_generated_re.search(header)

	90 return False

81	91

82	92

83 GENERATED_FILE = 'GENERATED FILE'	93 class _CopyrightsScanner(object):

84 NO_COPYRIGHT = 'No copyright'	94 @staticmethod

	95 def StaticInit(input_api):

	96 _CopyrightsScanner._c_comment_re = \

	97 input_api.re.compile(r'''"[^"\\](?:\\.[^"\\])*"''')

	98 _CopyrightsScanner._copyright_indicator = \

	99 r'(?:copyright\|copr\.\|\xc2\xa9\|$c$)'

	100 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(

	101 r'(?:\W\|^)' + _CopyrightsScanner._copyright_indicator + \

	102 r'(?::\s\|\s+)(\w.)$', input_api.re.IGNORECASE)

	103 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(

	104 r'\s*\b(?:info(?:rmation)?\|notice\|and\|or)\b', input_api.re.IGNORECASE)

85	105

86 class _CopyrightsScanner(object):	106 def __init__(self, input_api):

87 _c_comment_re = re.compile(r'''"[^"\\](?:\\.[^"\\])*"''')

88 _copyright_indicator = r'(?:copyright\|copr\.\|\xc2\xa9\|$c$)'

89 _full_copyright_indicator_re = \

90 re.compile(r'(?:\W\|^)' + _copyright_indicator + r'(?::\s\|\s+)(\w.)$', \

91 re.IGNORECASE)

92 _copyright_disindicator_re = \

93 re.compile(r'\s*\b(?:info(?:rmation)?\|notice\|and\|or)\b', re.IGNORECASE)

94

95 def __init__(self):

96 self.max_line_numbers_proximity = 3	107 self.max_line_numbers_proximity = 3

97 self.last_a_item_line_number = -200	108 self.last_a_item_line_number = -200

98 self.last_b_item_line_number = -100	109 self.last_b_item_line_number = -100

	110 self.re = input_api.re

99	111

100 def _CloseLineNumbers(self, a, b):	112 def _CloseLineNumbers(self, a, b):

101 return 0 <= a - b <= self.max_line_numbers_proximity	113 return 0 <= a - b <= self.max_line_numbers_proximity

102	114

103 def MatchLine(self, line_number, line):	115 def MatchLine(self, line_number, line):

104 if '"' in line:	116 if '"' in line:

105 line = _CopyrightsScanner._c_comment_re.sub('', line)	117 line = _CopyrightsScanner._c_comment_re.sub('', line)

106 upcase_line = line.upper()	118 upcase_line = line.upper()

107 # Record '(a)' and '(b)' last occurences in C++ comments.	119 # Record '(a)' and '(b)' last occurences in C++ comments.

108 # This is to filter out '(c)' used as a list item inside C++ comments.	120 # This is to filter out '(c)' used as a list item inside C++ comments.

(...skipping 15 matching lines...) Expand all Loading...
124 self.last_b_item_line_number) and \	136 self.last_b_item_line_number) and \

125 self._CloseLineNumbers(self.last_b_item_line_number,	137 self._CloseLineNumbers(self.last_b_item_line_number,

126 self.last_a_item_line_number):	138 self.last_a_item_line_number):

127 return None	139 return None

128 copyr = None	140 copyr = None

129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)	141 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)

130 if m and \	142 if m and \

131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):	143 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):

132 copyr = m.group(0)	144 copyr = m.group(0)

133 # Prettify the authorship string.	145 # Prettify the authorship string.

134 copyr = re.sub(r'([,.])?\s*$/', '', copyr)	146 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)

135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)	147 copyr = self.re.sub(

136 copyr = re.sub(r'^\s+', '', copyr)	148 _CopyrightsScanner._copyright_indicator, '', copyr, \

137 copyr = re.sub(r'\s{2,}', ' ', copyr)	149 flags=self.re.IGNORECASE)

138 copyr = re.sub(r'\\@', '@', copyr)	150 copyr = self.re.sub(r'^\s+', '', copyr)

	151 copyr = self.re.sub(r'\s{2,}', ' ', copyr)

	152 copyr = self.re.sub(r'\\@', '@', copyr)

139 return copyr	153 return copyr

140	154

141	155

142 def FindCopyrights(root_dir, files_to_scan):	156 def FindCopyrights(input_api, root_dir, files_to_scan):

143 """Determines code autorship, and finds generated files.	157 """Determines code autorship, and finds generated files.

144 Args:	158 Args:

	159 input_api: InputAPI, as in presubmit scripts.

145 root_dir: The root directory, to which all other paths are relative.	160 root_dir: The root directory, to which all other paths are relative.

146 files_to_scan: The list of file names to scan.	161 files_to_scan: The list of file names to scan.

147 Returns:	162 Returns:

148 The list of copyrights associated with each of the files given.	163 The list of copyrights associated with each of the files given.

149 If the certain file is generated, the corresponding list consists a single	164 If the certain file is generated, the corresponding list consists a single

150 entry -- 'GENERATED_FILE' string. If the file has no copyright info,	165 entry -- 'GENERATED_FILE' string. If the file has no copyright info,

151 the corresponding list contains 'NO_COPYRIGHT' string.	166 the corresponding list contains 'NO_COPYRIGHT' string.

152 """	167 """

	168 _GeneratedFilesDetector.StaticInit(input_api)

	169 _CopyrightsScanner.StaticInit(input_api)

153 copyrights = []	170 copyrights = []

154 for file_name in files_to_scan:	171 for file_name in files_to_scan:

155 linenum = 0	172 linenum = 0

156 header = ''	173 header = []

157 file_copyrights = []	174 file_copyrights = []

158 scanner = _CopyrightsScanner()	175 scanner = _CopyrightsScanner(input_api)

159 with open(os.path.join(root_dir, file_name), 'r') as f:	176 contents = input_api.ReadFile(

160 for l in f.readlines():	177 input_api.os_path.join(root_dir, file_name), 'rb')
	mkosiba (inactive) 2014/10/21 15:41:22
	Is 'rb' intentional?
	mnaganov (inactive) 2014/10/22 09:27:29
	On 2014/10/21 15:41:22, mkosiba wrote:
	> is 'rb' intentional?
	Not sure :) Let's stick with 'r', as before.
161 linenum += 1	178 for l in contents.split('\n'):

162 if linenum <= 25:	179 linenum += 1

163 header += l	180 if linenum <= 25:

164 c = scanner.MatchLine(linenum, l)	181 header.append(l)

165 if c:	182 c = scanner.MatchLine(linenum, l)

166 file_copyrights.append(c)	183 if c:

167 if _IsGeneratedFile(header):	184 file_copyrights.append(c)

168 copyrights.append([GENERATED_FILE])	185 if _GeneratedFilesDetector._IsGeneratedFile('\n'.join(header)):

169 elif file_copyrights:	186 copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])

170 copyrights.append(file_copyrights)	187 elif file_copyrights:

171 else:	188 copyrights.append(file_copyrights)

172 copyrights.append([NO_COPYRIGHT])	189 else:

	190 copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])

173 return copyrights	191 return copyrights

174	192

175	193

176 def FindCopyrightViolations(root_dir, files_to_scan):	194 def FindCopyrightViolations(input_api, root_dir, files_to_scan):

177 """Looks for files that are not belong exlusively to the Chromium Authors.	195 """Looks for files that are not belong exlusively to the Chromium Authors.

178 Args:	196 Args:

	197 input_api: InputAPI, as in presubmit scripts.

179 root_dir: The root directory, to which all other paths are relative.	198 root_dir: The root directory, to which all other paths are relative.

180 files_to_scan: The list of file names to scan.	199 files_to_scan: The list of file names to scan.

181 Returns:	200 Returns:

182 The list of file names that contain non-Chromium copyrights.	201 The list of file names that contain non-Chromium copyrights.

183 """	202 """

184 copyrights = FindCopyrights(root_dir, files_to_scan)	203 copyrights = FindCopyrights(input_api, root_dir, files_to_scan)

185 offending_files = []	204 offending_files = []

186 allowed_copyrights_re = re.compile(	205 allowed_copyrights_re = input_api.re.compile(

187 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '	206 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '

188 'All rights reserved.*)$')	207 'All rights reserved.*)$')

189 for f, cs in itertools.izip(files_to_scan, copyrights):	208 for f, cs in itertools.izip(files_to_scan, copyrights):

190 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:	209 if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \

	210 cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:

191 continue	211 continue

192 for c in cs:	212 for c in cs:

193 if not allowed_copyrights_re.match(c):	213 if not allowed_copyrights_re.match(c):

194 offending_files.append(os.path.normpath(f))	214 offending_files.append(input_api.os_path.normpath(f))

195 break	215 break

196 return offending_files	216 return offending_files

OLD	NEW