Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(109)

Side by Side Diff: android_webview/tools/copyright_scanner.py

Issue 667723002: [Android WebView] Prepare the copyrights scanner to run from presubmit scripts (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Add a removed empty line | Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 # Copyright 2014 The Chromium Authors. All rights reserved. 1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 """Utilities for scanning source files to determine code authorship. 5 """Utilities for scanning source files to determine code authorship.
6 """ 6 """
7 7
8 import itertools 8 import itertools
9 import os
10 import re
11 9
12 10
13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list): 11 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
14 """Similar to UNIX utility find(1), searches for files in the directories. 12 """Similar to UNIX utility find(1), searches for files in the directories.
15 Automatically leaves out only source code files. 13 Automatically leaves out only source code files.
16 Args: 14 Args:
15 input_api: InputAPI, as in presubmit scripts.
17 root_dir: The root directory, to which all other paths are relative. 16 root_dir: The root directory, to which all other paths are relative.
18 start_paths_list: The list of paths to start search from. Each path can 17 start_paths_list: The list of paths to start search from. Each path can
19 be a file or a directory. 18 be a file or a directory.
20 excluded_dirs_list: The list of directories to skip. 19 excluded_dirs_list: The list of directories to skip.
21 Returns: 20 Returns:
22 The list of source code files found, relative to |root_dir|. 21 The list of source code files found, relative to |root_dir|.
23 """ 22 """
24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] 23 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]
25 def IsBlacklistedDir(d): 24 def IsBlacklistedDir(d):
26 for item in dirs_blacklist: 25 for item in dirs_blacklist:
27 if item in d: 26 if item in d:
28 return True 27 return True
29 return False 28 return False
30 29
31 files_whitelist_re = re.compile( 30 files_whitelist_re = input_api.re.compile(
32 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' 31 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
33 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' 32 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
34 '|tex|mli?)$') 33 '|tex|mli?)$')
35 files = [] 34 files = []
36 35
37 base_path_len = len(root_dir) 36 base_path_len = len(root_dir)
38 for path in start_paths_list: 37 for path in start_paths_list:
39 full_path = os.path.join(root_dir, path) 38 full_path = input_api.os_path.join(root_dir, path)
40 if os.path.isfile(full_path): 39 if input_api.os_path.isfile(full_path):
41 if files_whitelist_re.search(path): 40 if files_whitelist_re.search(path):
42 files.append(path) 41 files.append(path)
43 else: 42 else:
44 for dirpath, dirnames, filenames in os.walk(full_path): 43 for dirpath, dirnames, filenames in input_api.os_walk(full_path):
45 # Remove excluded subdirs for faster scanning. 44 # Remove excluded subdirs for faster scanning.
46 for item in dirnames[:]: 45 for item in dirnames[:]:
47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]): 46 if IsBlacklistedDir(
47 input_api.os_path.join(dirpath, item)[base_path_len + 1:]):
48 dirnames.remove(item) 48 dirnames.remove(item)
49 for filename in filenames: 49 for filename in filenames:
50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:] 50 filepath = \
51 input_api.os_path.join(dirpath, filename)[base_path_len + 1:]
51 if files_whitelist_re.search(filepath) and \ 52 if files_whitelist_re.search(filepath) and \
52 not IsBlacklistedDir(filepath): 53 not IsBlacklistedDir(filepath):
53 files.append(filepath) 54 files.append(filepath)
54 return files 55 return files
55 56
56 57
57 python_multiline_string_double_re = re.compile( 58 class _GeneratedFilesDetector(object):
58 r'"""[^"]*(?:"""|$)', flags=re.MULTILINE) 59 GENERATED_FILE = 'GENERATED FILE'
59 python_multiline_string_single_re = re.compile( 60 NO_COPYRIGHT = '*No copyright*'
60 r"'''[^']*(?:'''|$)", flags=re.MULTILINE)
61 automatically_generated_re = re.compile(
62 r'(All changes made in this file will be lost'
63 '|DO NOT (EDIT|delete this file)'
64 '|Generated (at|automatically|data)'
65 '|Automatically generated'
66 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)
67 61
68 def _IsGeneratedFile(header): 62 @staticmethod
69 header = header.upper() 63 def StaticInit(input_api):
70 if '"""' in header: 64 _GeneratedFilesDetector.python_multiline_string_double_re = \
mkosiba (inactive) 2014/10/21 15:41:22 wouldn't it be simpler to have these be instance m
mnaganov (inactive) 2014/10/22 09:27:29 Done. But it's different for _CopyrightsScanner,
71 header = python_multiline_string_double_re.sub('', header) 65 input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
72 if "'''" in header: 66 _GeneratedFilesDetector.python_multiline_string_single_re = \
73 header = python_multiline_string_single_re.sub('', header) 67 input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
74 # First do simple strings lookup to save time. 68 _GeneratedFilesDetector.automatically_generated_re = input_api.re.compile(
75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: 69 r'(All changes made in this file will be lost'
76 return True 70 '|DO NOT (EDIT|delete this file)'
77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ 71 '|Generated (at|automatically|data)'
78 'GENERATED' in header: 72 '|Automatically generated'
79 return automatically_generated_re.search(header) 73 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)
80 return False 74
75 @staticmethod
76 def _IsGeneratedFile(header):
77 header = header.upper()
78 if '"""' in header:
79 header = _GeneratedFilesDetector.python_multiline_string_double_re.sub(
80 '', header)
81 if "'''" in header:
82 header = _GeneratedFilesDetector.python_multiline_string_single_re.sub(
83 '', header)
84 # First do simple strings lookup to save time.
85 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
86 return True
87 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
88 'GENERATED' in header:
89 return _GeneratedFilesDetector.automatically_generated_re.search(header)
90 return False
81 91
82 92
83 GENERATED_FILE = 'GENERATED FILE' 93 class _CopyrightsScanner(object):
84 NO_COPYRIGHT = '*No copyright*' 94 @staticmethod
95 def StaticInit(input_api):
96 _CopyrightsScanner._c_comment_re = \
97 input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
98 _CopyrightsScanner._copyright_indicator = \
99 r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
100 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
101 r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
102 r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
103 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
104 r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)
85 105
86 class _CopyrightsScanner(object): 106 def __init__(self, input_api):
87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
89 _full_copyright_indicator_re = \
90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \
91 re.IGNORECASE)
92 _copyright_disindicator_re = \
93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE)
94
95 def __init__(self):
96 self.max_line_numbers_proximity = 3 107 self.max_line_numbers_proximity = 3
97 self.last_a_item_line_number = -200 108 self.last_a_item_line_number = -200
98 self.last_b_item_line_number = -100 109 self.last_b_item_line_number = -100
110 self.re = input_api.re
99 111
100 def _CloseLineNumbers(self, a, b): 112 def _CloseLineNumbers(self, a, b):
101 return 0 <= a - b <= self.max_line_numbers_proximity 113 return 0 <= a - b <= self.max_line_numbers_proximity
102 114
103 def MatchLine(self, line_number, line): 115 def MatchLine(self, line_number, line):
104 if '"' in line: 116 if '"' in line:
105 line = _CopyrightsScanner._c_comment_re.sub('', line) 117 line = _CopyrightsScanner._c_comment_re.sub('', line)
106 upcase_line = line.upper() 118 upcase_line = line.upper()
107 # Record '(a)' and '(b)' last occurences in C++ comments. 119 # Record '(a)' and '(b)' last occurences in C++ comments.
108 # This is to filter out '(c)' used as a list item inside C++ comments. 120 # This is to filter out '(c)' used as a list item inside C++ comments.
(...skipping 15 matching lines...) Expand all
124 self.last_b_item_line_number) and \ 136 self.last_b_item_line_number) and \
125 self._CloseLineNumbers(self.last_b_item_line_number, 137 self._CloseLineNumbers(self.last_b_item_line_number,
126 self.last_a_item_line_number): 138 self.last_a_item_line_number):
127 return None 139 return None
128 copyr = None 140 copyr = None
129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) 141 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
130 if m and \ 142 if m and \
131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): 143 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
132 copyr = m.group(0) 144 copyr = m.group(0)
133 # Prettify the authorship string. 145 # Prettify the authorship string.
134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) 146 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)
135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) 147 copyr = self.re.sub(
136 copyr = re.sub(r'^\s+', '', copyr) 148 _CopyrightsScanner._copyright_indicator, '', copyr, \
137 copyr = re.sub(r'\s{2,}', ' ', copyr) 149 flags=self.re.IGNORECASE)
138 copyr = re.sub(r'\\@', '@', copyr) 150 copyr = self.re.sub(r'^\s+', '', copyr)
151 copyr = self.re.sub(r'\s{2,}', ' ', copyr)
152 copyr = self.re.sub(r'\\@', '@', copyr)
139 return copyr 153 return copyr
140 154
141 155
142 def FindCopyrights(root_dir, files_to_scan): 156 def FindCopyrights(input_api, root_dir, files_to_scan):
143 """Determines code autorship, and finds generated files. 157 """Determines code autorship, and finds generated files.
144 Args: 158 Args:
159 input_api: InputAPI, as in presubmit scripts.
145 root_dir: The root directory, to which all other paths are relative. 160 root_dir: The root directory, to which all other paths are relative.
146 files_to_scan: The list of file names to scan. 161 files_to_scan: The list of file names to scan.
147 Returns: 162 Returns:
148 The list of copyrights associated with each of the files given. 163 The list of copyrights associated with each of the files given.
149 If the certain file is generated, the corresponding list consists a single 164 If the certain file is generated, the corresponding list consists a single
150 entry -- 'GENERATED_FILE' string. If the file has no copyright info, 165 entry -- 'GENERATED_FILE' string. If the file has no copyright info,
151 the corresponding list contains 'NO_COPYRIGHT' string. 166 the corresponding list contains 'NO_COPYRIGHT' string.
152 """ 167 """
168 _GeneratedFilesDetector.StaticInit(input_api)
169 _CopyrightsScanner.StaticInit(input_api)
153 copyrights = [] 170 copyrights = []
154 for file_name in files_to_scan: 171 for file_name in files_to_scan:
155 linenum = 0 172 linenum = 0
156 header = '' 173 header = []
157 file_copyrights = [] 174 file_copyrights = []
158 scanner = _CopyrightsScanner() 175 scanner = _CopyrightsScanner(input_api)
159 with open(os.path.join(root_dir, file_name), 'r') as f: 176 contents = input_api.ReadFile(
160 for l in f.readlines(): 177 input_api.os_path.join(root_dir, file_name), 'rb')
mkosiba (inactive) 2014/10/21 15:41:22 is 'rb' intentional?
mnaganov (inactive) 2014/10/22 09:27:29 Not sure :) Let's stick with 'r', as before.
161 linenum += 1 178 for l in contents.split('\n'):
162 if linenum <= 25: 179 linenum += 1
163 header += l 180 if linenum <= 25:
164 c = scanner.MatchLine(linenum, l) 181 header.append(l)
165 if c: 182 c = scanner.MatchLine(linenum, l)
166 file_copyrights.append(c) 183 if c:
167 if _IsGeneratedFile(header): 184 file_copyrights.append(c)
168 copyrights.append([GENERATED_FILE]) 185 if _GeneratedFilesDetector._IsGeneratedFile('\n'.join(header)):
169 elif file_copyrights: 186 copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
170 copyrights.append(file_copyrights) 187 elif file_copyrights:
171 else: 188 copyrights.append(file_copyrights)
172 copyrights.append([NO_COPYRIGHT]) 189 else:
190 copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
173 return copyrights 191 return copyrights
174 192
175 193
176 def FindCopyrightViolations(root_dir, files_to_scan): 194 def FindCopyrightViolations(input_api, root_dir, files_to_scan):
177 """Looks for files that are not belong exlusively to the Chromium Authors. 195 """Looks for files that are not belong exlusively to the Chromium Authors.
178 Args: 196 Args:
197 input_api: InputAPI, as in presubmit scripts.
179 root_dir: The root directory, to which all other paths are relative. 198 root_dir: The root directory, to which all other paths are relative.
180 files_to_scan: The list of file names to scan. 199 files_to_scan: The list of file names to scan.
181 Returns: 200 Returns:
182 The list of file names that contain non-Chromium copyrights. 201 The list of file names that contain non-Chromium copyrights.
183 """ 202 """
184 copyrights = FindCopyrights(root_dir, files_to_scan) 203 copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
185 offending_files = [] 204 offending_files = []
186 allowed_copyrights_re = re.compile( 205 allowed_copyrights_re = input_api.re.compile(
187 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' 206 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
188 'All rights reserved.*)$') 207 'All rights reserved.*)$')
189 for f, cs in itertools.izip(files_to_scan, copyrights): 208 for f, cs in itertools.izip(files_to_scan, copyrights):
190 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT: 209 if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
210 cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
191 continue 211 continue
192 for c in cs: 212 for c in cs:
193 if not allowed_copyrights_re.match(c): 213 if not allowed_copyrights_re.match(c):
194 offending_files.append(os.path.normpath(f)) 214 offending_files.append(input_api.os_path.normpath(f))
195 break 215 break
196 return offending_files 216 return offending_files
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698