android_webview/tools/copyright_scanner.py - Issue 667723002: [Android WebView] Prepare the copyrights scanner to run from presubmit scripts

Side by Side Diff: android_webview/tools/copyright_scanner.py

Issue 667723002: [Android WebView] Prepare the copyrights scanner to run from presubmit scripts (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Comments addressed Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # Copyright 2014 The Chromium Authors. All rights reserved.	1 # Copyright 2014 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 """Utilities for scanning source files to determine code authorship.	5 """Utilities for scanning source files to determine code authorship.

6 """	6 """

7	7

8 import itertools	8 import itertools

9 import os

10 import re

11	9

12	10

13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list):	11 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):

14 """Similar to UNIX utility find(1), searches for files in the directories.	12 """Similar to UNIX utility find(1), searches for files in the directories.

15 Automatically leaves out only source code files.	13 Automatically leaves out only source code files.

16 Args:	14 Args:

	15 input_api: InputAPI, as in presubmit scripts.

17 root_dir: The root directory, to which all other paths are relative.	16 root_dir: The root directory, to which all other paths are relative.

18 start_paths_list: The list of paths to start search from. Each path can	17 start_paths_list: The list of paths to start search from. Each path can

19 be a file or a directory.	18 be a file or a directory.

20 excluded_dirs_list: The list of directories to skip.	19 excluded_dirs_list: The list of directories to skip.

21 Returns:	20 Returns:

22 The list of source code files found, relative to |root_dir|.	21 The list of source code files found, relative to |root_dir|.

23 """	22 """

24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]	23 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]

25 def IsBlacklistedDir(d):	24 def IsBlacklistedDir(d):

26 for item in dirs_blacklist:	25 for item in dirs_blacklist:

27 if item in d:	26 if item in d:

28 return True	27 return True

29 return False	28 return False

30	29

31 files_whitelist_re = re.compile(	30 files_whitelist_re = input_api.re.compile(

32 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'	31 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'

33 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'	32 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'

34 '|tex|mli?)$')	33 '|tex|mli?)$')

35 files = []	34 files = []

36	35

37 base_path_len = len(root_dir)	36 base_path_len = len(root_dir)

38 for path in start_paths_list:	37 for path in start_paths_list:

39 full_path = os.path.join(root_dir, path)	38 full_path = input_api.os_path.join(root_dir, path)

40 if os.path.isfile(full_path):	39 if input_api.os_path.isfile(full_path):

41 if files_whitelist_re.search(path):	40 if files_whitelist_re.search(path):

42 files.append(path)	41 files.append(path)

43 else:	42 else:

44 for dirpath, dirnames, filenames in os.walk(full_path):	43 for dirpath, dirnames, filenames in input_api.os_walk(full_path):

45 # Remove excluded subdirs for faster scanning.	44 # Remove excluded subdirs for faster scanning.

46 for item in dirnames[:]:	45 for item in dirnames[:]:

47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]):	46 if IsBlacklistedDir(

	47 input_api.os_path.join(dirpath, item)[base_path_len + 1:]):

48 dirnames.remove(item)	48 dirnames.remove(item)

49 for filename in filenames:	49 for filename in filenames:

50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:]	50 filepath = \

	51 input_api.os_path.join(dirpath, filename)[base_path_len + 1:]

51 if files_whitelist_re.search(filepath) and \	52 if files_whitelist_re.search(filepath) and \

52 not IsBlacklistedDir(filepath):	53 not IsBlacklistedDir(filepath):

53 files.append(filepath)	54 files.append(filepath)

54 return files	55 return files

55	56

56	57

57 python_multiline_string_double_re = re.compile(	58 class _GeneratedFilesDetector(object):

58 r'"""[^"]*(?:"""\|$)', flags=re.MULTILINE)	59 GENERATED_FILE = 'GENERATED FILE'

59 python_multiline_string_single_re = re.compile(	60 NO_COPYRIGHT = 'No copyright'

60 r"'''[^']*(?:'''\|$)", flags=re.MULTILINE)

61 automatically_generated_re = re.compile(

62 r'(All changes made in this file will be lost'

63 '\|DO NOT (EDIT\|delete this file)'

64 '\|Generated (at\|automatically\|data)'

65 '\|Automatically generated'

66 '\|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)

67	61

68 def _IsGeneratedFile(header):	62 def __init__(self, input_api):

69 header = header.upper()	63 self.python_multiline_string_double_re = \

70 if '"""' in header:	64 input_api.re.compile(r'"""[^"]*(?:"""\|$)', flags=input_api.re.MULTILINE)

71 header = python_multiline_string_double_re.sub('', header)	65 self.python_multiline_string_single_re = \

72 if "'''" in header:	66 input_api.re.compile(r"'''[^']*(?:'''\|$)", flags=input_api.re.MULTILINE)

73 header = python_multiline_string_single_re.sub('', header)	67 self.automatically_generated_re = input_api.re.compile(

74 # First do simple strings lookup to save time.	68 r'(All changes made in this file will be lost'

75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:	69 '\|DO NOT (EDIT\|delete this file)'

76 return True	70 '\|Generated (at\|automatically\|data)'

77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \	71 '\|Automatically generated'

78 'GENERATED' in header:	72 '\|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

79 return automatically_generated_re.search(header)	73

80 return False	74 def IsGeneratedFile(self, header):

	75 header = header.upper()

	76 if '"""' in header:

	77 header = self.python_multiline_string_double_re.sub('', header)

	78 if "'''" in header:

	79 header = self.python_multiline_string_single_re.sub('', header)

	80 # First do simple strings lookup to save time.

	81 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:

	82 return True

	83 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \

	84 'GENERATED' in header:

	85 return self.automatically_generated_re.search(header)

	86 return False

81	87

82	88

83 GENERATED_FILE = 'GENERATED FILE'	89 class _CopyrightsScanner(object):

84 NO_COPYRIGHT = 'No copyright'	90 @staticmethod

	91 def StaticInit(input_api):

	92 _CopyrightsScanner._c_comment_re = \

	93 input_api.re.compile(r'''"[^"\\](?:\\.[^"\\])*"''')

	94 _CopyrightsScanner._copyright_indicator = \

	95 r'(?:copyright\|copr\.\|\xc2\xa9\|$c$)'

	96 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(

	97 r'(?:\W\|^)' + _CopyrightsScanner._copyright_indicator + \

	98 r'(?::\s\|\s+)(\w.)$', input_api.re.IGNORECASE)

	99 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(

	100 r'\s*\b(?:info(?:rmation)?\|notice\|and\|or)\b', input_api.re.IGNORECASE)

85	101

86 class _CopyrightsScanner(object):	102 def __init__(self, input_api):

87 _c_comment_re = re.compile(r'''"[^"\\](?:\\.[^"\\])*"''')

88 _copyright_indicator = r'(?:copyright\|copr\.\|\xc2\xa9\|$c$)'

89 _full_copyright_indicator_re = \

90 re.compile(r'(?:\W\|^)' + _copyright_indicator + r'(?::\s\|\s+)(\w.)$', \

91 re.IGNORECASE)

92 _copyright_disindicator_re = \

93 re.compile(r'\s*\b(?:info(?:rmation)?\|notice\|and\|or)\b', re.IGNORECASE)

94

95 def __init__(self):

96 self.max_line_numbers_proximity = 3	103 self.max_line_numbers_proximity = 3

97 self.last_a_item_line_number = -200	104 self.last_a_item_line_number = -200

98 self.last_b_item_line_number = -100	105 self.last_b_item_line_number = -100

	106 self.re = input_api.re

99	107

100 def _CloseLineNumbers(self, a, b):	108 def _CloseLineNumbers(self, a, b):

101 return 0 <= a - b <= self.max_line_numbers_proximity	109 return 0 <= a - b <= self.max_line_numbers_proximity

102	110

103 def MatchLine(self, line_number, line):	111 def MatchLine(self, line_number, line):

104 if '"' in line:	112 if '"' in line:

105 line = _CopyrightsScanner._c_comment_re.sub('', line)	113 line = _CopyrightsScanner._c_comment_re.sub('', line)

106 upcase_line = line.upper()	114 upcase_line = line.upper()

107 # Record '(a)' and '(b)' last occurrences in C++ comments.	115 # Record '(a)' and '(b)' last occurrences in C++ comments.

108 # This is to filter out '(c)' used as a list item inside C++ comments.	116 # This is to filter out '(c)' used as a list item inside C++ comments.

(... 15 matching lines skipped ...)
124 self.last_b_item_line_number) and \	132 self.last_b_item_line_number) and \

125 self._CloseLineNumbers(self.last_b_item_line_number,	133 self._CloseLineNumbers(self.last_b_item_line_number,

126 self.last_a_item_line_number):	134 self.last_a_item_line_number):

127 return None	135 return None

128 copyr = None	136 copyr = None

129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)	137 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)

130 if m and \	138 if m and \

131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):	139 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):

132 copyr = m.group(0)	140 copyr = m.group(0)

133 # Prettify the authorship string.	141 # Prettify the authorship string.

134 copyr = re.sub(r'([,.])?\s*$/', '', copyr)	142 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)

135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)	143 copyr = self.re.sub(

136 copyr = re.sub(r'^\s+', '', copyr)	144 _CopyrightsScanner._copyright_indicator, '', copyr, \

137 copyr = re.sub(r'\s{2,}', ' ', copyr)	145 flags=self.re.IGNORECASE)

138 copyr = re.sub(r'\\@', '@', copyr)	146 copyr = self.re.sub(r'^\s+', '', copyr)

	147 copyr = self.re.sub(r'\s{2,}', ' ', copyr)

	148 copyr = self.re.sub(r'\\@', '@', copyr)

139 return copyr	149 return copyr

140	150

141	151

142 def FindCopyrights(root_dir, files_to_scan):	152 def FindCopyrights(input_api, root_dir, files_to_scan):

143 """Determines code authorship, and finds generated files.	153 """Determines code authorship, and finds generated files.

144 Args:	154 Args:

	155 input_api: InputAPI, as in presubmit scripts.

145 root_dir: The root directory, to which all other paths are relative.	156 root_dir: The root directory, to which all other paths are relative.

146 files_to_scan: The list of file names to scan.	157 files_to_scan: The list of file names to scan.

147 Returns:	158 Returns:

148 The list of copyrights associated with each of the files given.	159 The list of copyrights associated with each of the files given.

149 If a certain file is generated, the corresponding list consists of a single	160 If a certain file is generated, the corresponding list consists of a single

150 entry -- 'GENERATED_FILE' string. If the file has no copyright info,	161 entry -- 'GENERATED_FILE' string. If the file has no copyright info,

151 the corresponding list contains 'NO_COPYRIGHT' string.	162 the corresponding list contains 'NO_COPYRIGHT' string.

152 """	163 """

	164 generated_files_detector = _GeneratedFilesDetector(input_api)

	165 _CopyrightsScanner.StaticInit(input_api)

153 copyrights = []	166 copyrights = []

154 for file_name in files_to_scan:	167 for file_name in files_to_scan:

155 linenum = 0	168 linenum = 0

156 header = ''	169 header = []

157 file_copyrights = []	170 file_copyrights = []

158 scanner = _CopyrightsScanner()	171 scanner = _CopyrightsScanner(input_api)

159 with open(os.path.join(root_dir, file_name), 'r') as f:	172 contents = input_api.ReadFile(

160 for l in f.readlines():	173 input_api.os_path.join(root_dir, file_name), 'r')

161 linenum += 1	174 for l in contents.split('\n'):

162 if linenum <= 25:	175 linenum += 1

163 header += l	176 if linenum <= 25:

164 c = scanner.MatchLine(linenum, l)	177 header.append(l)

165 if c:	178 c = scanner.MatchLine(linenum, l)

166 file_copyrights.append(c)	179 if c:

167 if _IsGeneratedFile(header):	180 file_copyrights.append(c)

168 copyrights.append([GENERATED_FILE])	181 if generated_files_detector.IsGeneratedFile('\n'.join(header)):

169 elif file_copyrights:	182 copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])

170 copyrights.append(file_copyrights)	183 elif file_copyrights:

171 else:	184 copyrights.append(file_copyrights)

172 copyrights.append([NO_COPYRIGHT])	185 else:

	186 copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])

173 return copyrights	187 return copyrights

174	188

175	189

176 def FindCopyrightViolations(root_dir, files_to_scan):	190 def FindCopyrightViolations(input_api, root_dir, files_to_scan):

177 """Looks for files that do not belong exclusively to the Chromium Authors.	191 """Looks for files that do not belong exclusively to the Chromium Authors.

178 Args:	192 Args:

	193 input_api: InputAPI, as in presubmit scripts.

179 root_dir: The root directory, to which all other paths are relative.	194 root_dir: The root directory, to which all other paths are relative.

180 files_to_scan: The list of file names to scan.	195 files_to_scan: The list of file names to scan.

181 Returns:	196 Returns:

182 The list of file names that contain non-Chromium copyrights.	197 The list of file names that contain non-Chromium copyrights.

183 """	198 """

184 copyrights = FindCopyrights(root_dir, files_to_scan)	199 copyrights = FindCopyrights(input_api, root_dir, files_to_scan)

185 offending_files = []	200 offending_files = []

186 allowed_copyrights_re = re.compile(	201 allowed_copyrights_re = input_api.re.compile(

187 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '	202 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '

188 'All rights reserved.*)$')	203 'All rights reserved.*)$')

189 for f, cs in itertools.izip(files_to_scan, copyrights):	204 for f, cs in itertools.izip(files_to_scan, copyrights):

190 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:	205 if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \

	206 cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:

191 continue	207 continue

192 for c in cs:	208 for c in cs:

193 if not allowed_copyrights_re.match(c):	209 if not allowed_copyrights_re.match(c):

194 offending_files.append(os.path.normpath(f))	210 offending_files.append(input_api.os_path.normpath(f))

195 break	211 break

196 return offending_files	212 return offending_files

OLD	NEW