Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(384)

Side by Side Diff: android_webview/tools/copyright_scanner.py

Issue 667723002: [Android WebView] Prepare the copyrights scanner to run from presubmit scripts (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Comments addressed Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright 2014 The Chromium Authors. All rights reserved. 1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 """Utilities for scanning source files to determine code authorship. 5 """Utilities for scanning source files to determine code authorship.
6 """ 6 """
7 7
8 import itertools 8 import itertools
9 import os
10 import re
11 9
12 10
13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list): 11 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
14 """Similar to UNIX utility find(1), searches for files in the directories. 12 """Similar to UNIX utility find(1), searches for files in the directories.
15 Automatically keeps only source code files. 13 Automatically keeps only source code files.
16 Args: 14 Args:
15 input_api: InputAPI, as in presubmit scripts.
17 root_dir: The root directory, to which all other paths are relative. 16 root_dir: The root directory, to which all other paths are relative.
18 start_paths_list: The list of paths to start search from. Each path can 17 start_paths_list: The list of paths to start search from. Each path can
19 be a file or a directory. 18 be a file or a directory.
20 excluded_dirs_list: The list of directories to skip. 19 excluded_dirs_list: The list of directories to skip.
21 Returns: 20 Returns:
22 The list of source code files found, relative to |root_dir|. 21 The list of source code files found, relative to |root_dir|.
23 """ 22 """
24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] 23 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]
25 def IsBlacklistedDir(d): 24 def IsBlacklistedDir(d):
26 for item in dirs_blacklist: 25 for item in dirs_blacklist:
27 if item in d: 26 if item in d:
28 return True 27 return True
29 return False 28 return False
30 29
31 files_whitelist_re = re.compile( 30 files_whitelist_re = input_api.re.compile(
32 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' 31 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
33 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' 32 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
34 '|tex|mli?)$') 33 '|tex|mli?)$')
35 files = [] 34 files = []
36 35
37 base_path_len = len(root_dir) 36 base_path_len = len(root_dir)
38 for path in start_paths_list: 37 for path in start_paths_list:
39 full_path = os.path.join(root_dir, path) 38 full_path = input_api.os_path.join(root_dir, path)
40 if os.path.isfile(full_path): 39 if input_api.os_path.isfile(full_path):
41 if files_whitelist_re.search(path): 40 if files_whitelist_re.search(path):
42 files.append(path) 41 files.append(path)
43 else: 42 else:
44 for dirpath, dirnames, filenames in os.walk(full_path): 43 for dirpath, dirnames, filenames in input_api.os_walk(full_path):
45 # Remove excluded subdirs for faster scanning. 44 # Remove excluded subdirs for faster scanning.
46 for item in dirnames[:]: 45 for item in dirnames[:]:
47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]): 46 if IsBlacklistedDir(
47 input_api.os_path.join(dirpath, item)[base_path_len + 1:]):
48 dirnames.remove(item) 48 dirnames.remove(item)
49 for filename in filenames: 49 for filename in filenames:
50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:] 50 filepath = \
51 input_api.os_path.join(dirpath, filename)[base_path_len + 1:]
51 if files_whitelist_re.search(filepath) and \ 52 if files_whitelist_re.search(filepath) and \
52 not IsBlacklistedDir(filepath): 53 not IsBlacklistedDir(filepath):
53 files.append(filepath) 54 files.append(filepath)
54 return files 55 return files
55 56
56 57
57 python_multiline_string_double_re = re.compile( 58 class _GeneratedFilesDetector(object):
58 r'"""[^"]*(?:"""|$)', flags=re.MULTILINE) 59 GENERATED_FILE = 'GENERATED FILE'
59 python_multiline_string_single_re = re.compile( 60 NO_COPYRIGHT = '*No copyright*'
60 r"'''[^']*(?:'''|$)", flags=re.MULTILINE)
61 automatically_generated_re = re.compile(
62 r'(All changes made in this file will be lost'
63 '|DO NOT (EDIT|delete this file)'
64 '|Generated (at|automatically|data)'
65 '|Automatically generated'
66 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)
67 61
68 def _IsGeneratedFile(header): 62 def __init__(self, input_api):
69 header = header.upper() 63 self.python_multiline_string_double_re = \
70 if '"""' in header: 64 input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
71 header = python_multiline_string_double_re.sub('', header) 65 self.python_multiline_string_single_re = \
72 if "'''" in header: 66 input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
73 header = python_multiline_string_single_re.sub('', header) 67 self.automatically_generated_re = input_api.re.compile(
74 # First do simple strings lookup to save time. 68 r'(All changes made in this file will be lost'
75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: 69 '|DO NOT (EDIT|delete this file)'
76 return True 70 '|Generated (at|automatically|data)'
77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ 71 '|Automatically generated'
78 'GENERATED' in header: 72 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)
79 return automatically_generated_re.search(header) 73
80 return False 74 def IsGeneratedFile(self, header):
75 header = header.upper()
76 if '"""' in header:
77 header = self.python_multiline_string_double_re.sub('', header)
78 if "'''" in header:
79 header = self.python_multiline_string_single_re.sub('', header)
80 # First do simple strings lookup to save time.
81 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
82 return True
83 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
84 'GENERATED' in header:
85 return self.automatically_generated_re.search(header)
86 return False
81 87
82 88
83 GENERATED_FILE = 'GENERATED FILE' 89 class _CopyrightsScanner(object):
84 NO_COPYRIGHT = '*No copyright*' 90 @staticmethod
91 def StaticInit(input_api):
92 _CopyrightsScanner._c_comment_re = \
93 input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
94 _CopyrightsScanner._copyright_indicator = \
95 r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
96 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
97 r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
98 r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
99 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
100 r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)
85 101
86 class _CopyrightsScanner(object): 102 def __init__(self, input_api):
87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
89 _full_copyright_indicator_re = \
90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \
91 re.IGNORECASE)
92 _copyright_disindicator_re = \
93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE)
94
95 def __init__(self):
96 self.max_line_numbers_proximity = 3 103 self.max_line_numbers_proximity = 3
97 self.last_a_item_line_number = -200 104 self.last_a_item_line_number = -200
98 self.last_b_item_line_number = -100 105 self.last_b_item_line_number = -100
106 self.re = input_api.re
99 107
100 def _CloseLineNumbers(self, a, b): 108 def _CloseLineNumbers(self, a, b):
101 return 0 <= a - b <= self.max_line_numbers_proximity 109 return 0 <= a - b <= self.max_line_numbers_proximity
102 110
103 def MatchLine(self, line_number, line): 111 def MatchLine(self, line_number, line):
104 if '"' in line: 112 if '"' in line:
105 line = _CopyrightsScanner._c_comment_re.sub('', line) 113 line = _CopyrightsScanner._c_comment_re.sub('', line)
106 upcase_line = line.upper() 114 upcase_line = line.upper()
107 # Record '(a)' and '(b)' last occurrences in C++ comments. 115 # Record '(a)' and '(b)' last occurrences in C++ comments.
108 # This is to filter out '(c)' used as a list item inside C++ comments. 116 # This is to filter out '(c)' used as a list item inside C++ comments.
(...skipping 15 matching lines...) Expand all
124 self.last_b_item_line_number) and \ 132 self.last_b_item_line_number) and \
125 self._CloseLineNumbers(self.last_b_item_line_number, 133 self._CloseLineNumbers(self.last_b_item_line_number,
126 self.last_a_item_line_number): 134 self.last_a_item_line_number):
127 return None 135 return None
128 copyr = None 136 copyr = None
129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) 137 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
130 if m and \ 138 if m and \
131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): 139 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
132 copyr = m.group(0) 140 copyr = m.group(0)
133 # Prettify the authorship string. 141 # Prettify the authorship string.
134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) 142 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)
135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) 143 copyr = self.re.sub(
136 copyr = re.sub(r'^\s+', '', copyr) 144 _CopyrightsScanner._copyright_indicator, '', copyr, \
137 copyr = re.sub(r'\s{2,}', ' ', copyr) 145 flags=self.re.IGNORECASE)
138 copyr = re.sub(r'\\@', '@', copyr) 146 copyr = self.re.sub(r'^\s+', '', copyr)
147 copyr = self.re.sub(r'\s{2,}', ' ', copyr)
148 copyr = self.re.sub(r'\\@', '@', copyr)
139 return copyr 149 return copyr
140 150
141 151
142 def FindCopyrights(root_dir, files_to_scan): 152 def FindCopyrights(input_api, root_dir, files_to_scan):
143 """Determines code authorship, and finds generated files. 153 """Determines code authorship, and finds generated files.
144 Args: 154 Args:
155 input_api: InputAPI, as in presubmit scripts.
145 root_dir: The root directory, to which all other paths are relative. 156 root_dir: The root directory, to which all other paths are relative.
146 files_to_scan: The list of file names to scan. 157 files_to_scan: The list of file names to scan.
147 Returns: 158 Returns:
148 The list of copyrights associated with each of the files given. 159 The list of copyrights associated with each of the files given.
149 If a certain file is generated, the corresponding list consists of a single 160 If a certain file is generated, the corresponding list consists of a single
150 entry -- 'GENERATED_FILE' string. If the file has no copyright info, 161 entry -- 'GENERATED_FILE' string. If the file has no copyright info,
151 the corresponding list contains 'NO_COPYRIGHT' string. 162 the corresponding list contains 'NO_COPYRIGHT' string.
152 """ 163 """
164 generated_files_detector = _GeneratedFilesDetector(input_api)
165 _CopyrightsScanner.StaticInit(input_api)
153 copyrights = [] 166 copyrights = []
154 for file_name in files_to_scan: 167 for file_name in files_to_scan:
155 linenum = 0 168 linenum = 0
156 header = '' 169 header = []
157 file_copyrights = [] 170 file_copyrights = []
158 scanner = _CopyrightsScanner() 171 scanner = _CopyrightsScanner(input_api)
159 with open(os.path.join(root_dir, file_name), 'r') as f: 172 contents = input_api.ReadFile(
160 for l in f.readlines(): 173 input_api.os_path.join(root_dir, file_name), 'r')
161 linenum += 1 174 for l in contents.split('\n'):
162 if linenum <= 25: 175 linenum += 1
163 header += l 176 if linenum <= 25:
164 c = scanner.MatchLine(linenum, l) 177 header.append(l)
165 if c: 178 c = scanner.MatchLine(linenum, l)
166 file_copyrights.append(c) 179 if c:
167 if _IsGeneratedFile(header): 180 file_copyrights.append(c)
168 copyrights.append([GENERATED_FILE]) 181 if generated_files_detector.IsGeneratedFile('\n'.join(header)):
169 elif file_copyrights: 182 copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
170 copyrights.append(file_copyrights) 183 elif file_copyrights:
171 else: 184 copyrights.append(file_copyrights)
172 copyrights.append([NO_COPYRIGHT]) 185 else:
186 copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
173 return copyrights 187 return copyrights
174 188
175 189
176 def FindCopyrightViolations(root_dir, files_to_scan): 190 def FindCopyrightViolations(input_api, root_dir, files_to_scan):
177 """Looks for files that do not belong exclusively to the Chromium Authors. 191 """Looks for files that do not belong exclusively to the Chromium Authors.
178 Args: 192 Args:
193 input_api: InputAPI, as in presubmit scripts.
179 root_dir: The root directory, to which all other paths are relative. 194 root_dir: The root directory, to which all other paths are relative.
180 files_to_scan: The list of file names to scan. 195 files_to_scan: The list of file names to scan.
181 Returns: 196 Returns:
182 The list of file names that contain non-Chromium copyrights. 197 The list of file names that contain non-Chromium copyrights.
183 """ 198 """
184 copyrights = FindCopyrights(root_dir, files_to_scan) 199 copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
185 offending_files = [] 200 offending_files = []
186 allowed_copyrights_re = re.compile( 201 allowed_copyrights_re = input_api.re.compile(
187 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' 202 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
188 'All rights reserved.*)$') 203 'All rights reserved.*)$')
189 for f, cs in itertools.izip(files_to_scan, copyrights): 204 for f, cs in itertools.izip(files_to_scan, copyrights):
190 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT: 205 if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
206 cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
191 continue 207 continue
192 for c in cs: 208 for c in cs:
193 if not allowed_copyrights_re.match(c): 209 if not allowed_copyrights_re.match(c):
194 offending_files.append(os.path.normpath(f)) 210 offending_files.append(input_api.os_path.normpath(f))
195 break 211 break
196 return offending_files 212 return offending_files
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698