OLD | NEW |
---|---|
1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 """Utilities for scanning source files to determine code authorship. | 5 """Utilities for scanning source files to determine code authorship. |
6 """ | 6 """ |
7 | 7 |
8 import itertools | 8 import itertools |
9 import os | |
10 import re | |
11 | 9 |
12 | 10 |
13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list): | 11 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list): |
14 """Similar to UNIX utility find(1), searches for files in the directories. | 12 """Similar to UNIX utility find(1), searches for files in the directories. |
15 Automatically keeps only source code files. | 13 Automatically keeps only source code files. |
16 Args: | 14 Args: |
15 input_api: InputAPI, as in presubmit scripts. | |
17 root_dir: The root directory, to which all other paths are relative. | 16 root_dir: The root directory, to which all other paths are relative. |
18 start_paths_list: The list of paths to start search from. Each path can | 17 start_paths_list: The list of paths to start search from. Each path can |
19 be a file or a directory. | 18 be a file or a directory. |
20 excluded_dirs_list: The list of directories to skip. | 19 excluded_dirs_list: The list of directories to skip. |
21 Returns: | 20 Returns: |
22 The list of source code files found, relative to |root_dir|. | 21 The list of source code files found, relative to |root_dir|. |
23 """ | 22 """ |
24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] | 23 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] |
25 def IsBlacklistedDir(d): | 24 def IsBlacklistedDir(d): |
26 for item in dirs_blacklist: | 25 for item in dirs_blacklist: |
27 if item in d: | 26 if item in d: |
28 return True | 27 return True |
29 return False | 28 return False |
30 | 29 |
31 files_whitelist_re = re.compile( | 30 files_whitelist_re = input_api.re.compile( |
32 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' | 31 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' |
33 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' | 32 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' |
34 '|tex|mli?)$') | 33 '|tex|mli?)$') |
35 files = [] | 34 files = [] |
36 | 35 |
37 base_path_len = len(root_dir) | 36 base_path_len = len(root_dir) |
38 for path in start_paths_list: | 37 for path in start_paths_list: |
39 full_path = os.path.join(root_dir, path) | 38 full_path = input_api.os_path.join(root_dir, path) |
40 if os.path.isfile(full_path): | 39 if input_api.os_path.isfile(full_path): |
41 if files_whitelist_re.search(path): | 40 if files_whitelist_re.search(path): |
42 files.append(path) | 41 files.append(path) |
43 else: | 42 else: |
44 for dirpath, dirnames, filenames in os.walk(full_path): | 43 for dirpath, dirnames, filenames in input_api.os_walk(full_path): |
45 # Remove excluded subdirs for faster scanning. | 44 # Remove excluded subdirs for faster scanning. |
46 for item in dirnames[:]: | 45 for item in dirnames[:]: |
47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]): | 46 if IsBlacklistedDir( |
47 input_api.os_path.join(dirpath, item)[base_path_len + 1:]): | |
48 dirnames.remove(item) | 48 dirnames.remove(item) |
49 for filename in filenames: | 49 for filename in filenames: |
50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:] | 50 filepath = \ |
51 input_api.os_path.join(dirpath, filename)[base_path_len + 1:] | |
51 if files_whitelist_re.search(filepath) and \ | 52 if files_whitelist_re.search(filepath) and \ |
52 not IsBlacklistedDir(filepath): | 53 not IsBlacklistedDir(filepath): |
53 files.append(filepath) | 54 files.append(filepath) |
54 return files | 55 return files |
55 | 56 |
56 | 57 |
57 python_multiline_string_double_re = re.compile( | 58 class _GeneratedFilesDetector(object): |
58 r'"""[^"]*(?:"""|$)', flags=re.MULTILINE) | 59 GENERATED_FILE = 'GENERATED FILE' |
59 python_multiline_string_single_re = re.compile( | 60 NO_COPYRIGHT = '*No copyright*' |
60 r"'''[^']*(?:'''|$)", flags=re.MULTILINE) | |
61 automatically_generated_re = re.compile( | |
62 r'(All changes made in this file will be lost' | |
63 '|DO NOT (EDIT|delete this file)' | |
64 '|Generated (at|automatically|data)' | |
65 '|Automatically generated' | |
66 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE) | |
67 | 61 |
68 def _IsGeneratedFile(header): | 62 @staticmethod |
69 header = header.upper() | 63 def StaticInit(input_api): |
70 if '"""' in header: | 64 _GeneratedFilesDetector.python_multiline_string_double_re = \ |
mkosiba (inactive)
2014/10/21 15:41:22
wouldn't it be simpler to have these be instance m
mnaganov (inactive)
2014/10/22 09:27:29
Done.
But it's different for _CopyrightsScanner,
| |
71 header = python_multiline_string_double_re.sub('', header) | 65 input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE) |
72 if "'''" in header: | 66 _GeneratedFilesDetector.python_multiline_string_single_re = \ |
73 header = python_multiline_string_single_re.sub('', header) | 67 input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE) |
74 # First do simple strings lookup to save time. | 68 _GeneratedFilesDetector.automatically_generated_re = input_api.re.compile( |
75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: | 69 r'(All changes made in this file will be lost' |
76 return True | 70 '|DO NOT (EDIT|delete this file)' |
77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ | 71 '|Generated (at|automatically|data)' |
78 'GENERATED' in header: | 72 '|Automatically generated' |
79 return automatically_generated_re.search(header) | 73 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE) |
80 return False | 74 |
75 @staticmethod | |
76 def _IsGeneratedFile(header): | |
77 header = header.upper() | |
78 if '"""' in header: | |
79 header = _GeneratedFilesDetector.python_multiline_string_double_re.sub( | |
80 '', header) | |
81 if "'''" in header: | |
82 header = _GeneratedFilesDetector.python_multiline_string_single_re.sub( | |
83 '', header) | |
84 # First do simple strings lookup to save time. | |
85 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: | |
86 return True | |
87 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ | |
88 'GENERATED' in header: | |
89 return _GeneratedFilesDetector.automatically_generated_re.search(header) | |
90 return False | |
81 | 91 |
82 | 92 |
83 GENERATED_FILE = 'GENERATED FILE' | 93 class _CopyrightsScanner(object): |
84 NO_COPYRIGHT = '*No copyright*' | 94 @staticmethod |
95 def StaticInit(input_api): | |
96 _CopyrightsScanner._c_comment_re = \ | |
97 input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') | |
98 _CopyrightsScanner._copyright_indicator = \ | |
99 r'(?:copyright|copr\.|\xc2\xa9|\(c\))' | |
100 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile( | |
101 r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \ | |
102 r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE) | |
103 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile( | |
104 r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE) | |
85 | 105 |
86 class _CopyrightsScanner(object): | 106 def __init__(self, input_api): |
87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') | |
88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))' | |
89 _full_copyright_indicator_re = \ | |
90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \ | |
91 re.IGNORECASE) | |
92 _copyright_disindicator_re = \ | |
93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE) | |
94 | |
95 def __init__(self): | |
96 self.max_line_numbers_proximity = 3 | 107 self.max_line_numbers_proximity = 3 |
97 self.last_a_item_line_number = -200 | 108 self.last_a_item_line_number = -200 |
98 self.last_b_item_line_number = -100 | 109 self.last_b_item_line_number = -100 |
110 self.re = input_api.re | |
99 | 111 |
100 def _CloseLineNumbers(self, a, b): | 112 def _CloseLineNumbers(self, a, b): |
101 return 0 <= a - b <= self.max_line_numbers_proximity | 113 return 0 <= a - b <= self.max_line_numbers_proximity |
102 | 114 |
103 def MatchLine(self, line_number, line): | 115 def MatchLine(self, line_number, line): |
104 if '"' in line: | 116 if '"' in line: |
105 line = _CopyrightsScanner._c_comment_re.sub('', line) | 117 line = _CopyrightsScanner._c_comment_re.sub('', line) |
106 upcase_line = line.upper() | 118 upcase_line = line.upper() |
107 # Record '(a)' and '(b)' last occurrences in C++ comments. | 119 # Record '(a)' and '(b)' last occurrences in C++ comments. |
108 # This is to filter out '(c)' used as a list item inside C++ comments. | 120 # This is to filter out '(c)' used as a list item inside C++ comments. |
(...skipping 15 matching lines...) Expand all Loading... | |
124 self.last_b_item_line_number) and \ | 136 self.last_b_item_line_number) and \ |
125 self._CloseLineNumbers(self.last_b_item_line_number, | 137 self._CloseLineNumbers(self.last_b_item_line_number, |
126 self.last_a_item_line_number): | 138 self.last_a_item_line_number): |
127 return None | 139 return None |
128 copyr = None | 140 copyr = None |
129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) | 141 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) |
130 if m and \ | 142 if m and \ |
131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): | 143 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): |
132 copyr = m.group(0) | 144 copyr = m.group(0) |
133 # Prettify the authorship string. | 145 # Prettify the authorship string. |
134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) | 146 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr) |
135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) | 147 copyr = self.re.sub( |
136 copyr = re.sub(r'^\s+', '', copyr) | 148 _CopyrightsScanner._copyright_indicator, '', copyr, \ |
137 copyr = re.sub(r'\s{2,}', ' ', copyr) | 149 flags=self.re.IGNORECASE) |
138 copyr = re.sub(r'\\@', '@', copyr) | 150 copyr = self.re.sub(r'^\s+', '', copyr) |
151 copyr = self.re.sub(r'\s{2,}', ' ', copyr) | |
152 copyr = self.re.sub(r'\\@', '@', copyr) | |
139 return copyr | 153 return copyr |
140 | 154 |
141 | 155 |
142 def FindCopyrights(root_dir, files_to_scan): | 156 def FindCopyrights(input_api, root_dir, files_to_scan): |
143 """Determines code authorship, and finds generated files. | 157 """Determines code authorship, and finds generated files. |
144 Args: | 158 Args: |
159 input_api: InputAPI, as in presubmit scripts. | |
145 root_dir: The root directory, to which all other paths are relative. | 160 root_dir: The root directory, to which all other paths are relative. |
146 files_to_scan: The list of file names to scan. | 161 files_to_scan: The list of file names to scan. |
147 Returns: | 162 Returns: |
148 The list of copyrights associated with each of the files given. | 163 The list of copyrights associated with each of the files given. |
149 If a given file is generated, the corresponding list consists of a single | 164 If a given file is generated, the corresponding list consists of a single |
150 entry -- 'GENERATED_FILE' string. If the file has no copyright info, | 165 entry -- 'GENERATED_FILE' string. If the file has no copyright info, |
151 the corresponding list contains 'NO_COPYRIGHT' string. | 166 the corresponding list contains 'NO_COPYRIGHT' string. |
152 """ | 167 """ |
168 _GeneratedFilesDetector.StaticInit(input_api) | |
169 _CopyrightsScanner.StaticInit(input_api) | |
153 copyrights = [] | 170 copyrights = [] |
154 for file_name in files_to_scan: | 171 for file_name in files_to_scan: |
155 linenum = 0 | 172 linenum = 0 |
156 header = '' | 173 header = [] |
157 file_copyrights = [] | 174 file_copyrights = [] |
158 scanner = _CopyrightsScanner() | 175 scanner = _CopyrightsScanner(input_api) |
159 with open(os.path.join(root_dir, file_name), 'r') as f: | 176 contents = input_api.ReadFile( |
160 for l in f.readlines(): | 177 input_api.os_path.join(root_dir, file_name), 'rb') |
mkosiba (inactive)
2014/10/21 15:41:22
is 'rb' intentional?
mnaganov (inactive)
2014/10/22 09:27:29
Not sure :) Let's stick with 'r', as before.
| |
161 linenum += 1 | 178 for l in contents.split('\n'): |
162 if linenum <= 25: | 179 linenum += 1 |
163 header += l | 180 if linenum <= 25: |
164 c = scanner.MatchLine(linenum, l) | 181 header.append(l) |
165 if c: | 182 c = scanner.MatchLine(linenum, l) |
166 file_copyrights.append(c) | 183 if c: |
167 if _IsGeneratedFile(header): | 184 file_copyrights.append(c) |
168 copyrights.append([GENERATED_FILE]) | 185 if _GeneratedFilesDetector._IsGeneratedFile('\n'.join(header)): |
169 elif file_copyrights: | 186 copyrights.append([_GeneratedFilesDetector.GENERATED_FILE]) |
170 copyrights.append(file_copyrights) | 187 elif file_copyrights: |
171 else: | 188 copyrights.append(file_copyrights) |
172 copyrights.append([NO_COPYRIGHT]) | 189 else: |
190 copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT]) | |
173 return copyrights | 191 return copyrights |
174 | 192 |
175 | 193 |
176 def FindCopyrightViolations(root_dir, files_to_scan): | 194 def FindCopyrightViolations(input_api, root_dir, files_to_scan): |
177 """Looks for files that do not belong exclusively to the Chromium Authors. | 195 """Looks for files that do not belong exclusively to the Chromium Authors. |
178 Args: | 196 Args: |
197 input_api: InputAPI, as in presubmit scripts. | |
179 root_dir: The root directory, to which all other paths are relative. | 198 root_dir: The root directory, to which all other paths are relative. |
180 files_to_scan: The list of file names to scan. | 199 files_to_scan: The list of file names to scan. |
181 Returns: | 200 Returns: |
182 The list of file names that contain non-Chromium copyrights. | 201 The list of file names that contain non-Chromium copyrights. |
183 """ | 202 """ |
184 copyrights = FindCopyrights(root_dir, files_to_scan) | 203 copyrights = FindCopyrights(input_api, root_dir, files_to_scan) |
185 offending_files = [] | 204 offending_files = [] |
186 allowed_copyrights_re = re.compile( | 205 allowed_copyrights_re = input_api.re.compile( |
187 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' | 206 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' |
188 'All rights reserved.*)$') | 207 'All rights reserved.*)$') |
189 for f, cs in itertools.izip(files_to_scan, copyrights): | 208 for f, cs in itertools.izip(files_to_scan, copyrights): |
190 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT: | 209 if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \ |
210 cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT: | |
191 continue | 211 continue |
192 for c in cs: | 212 for c in cs: |
193 if not allowed_copyrights_re.match(c): | 213 if not allowed_copyrights_re.match(c): |
194 offending_files.append(os.path.normpath(f)) | 214 offending_files.append(input_api.os_path.normpath(f)) |
195 break | 215 break |
196 return offending_files | 216 return offending_files |
OLD | NEW |