OLD | NEW |
1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 """Utilities for scanning source files to determine code authorship. | 5 """Utilities for scanning source files to determine code authorship. |
6 """ | 6 """ |
7 | 7 |
8 import itertools | 8 import itertools |
9 import os | |
10 import re | |
11 | 9 |
12 | 10 |
13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list): | 11 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list): |
14 """Similar to UNIX utility find(1), searches for files in the directories. | 12 """Similar to UNIX utility find(1), searches for files in the directories. |
15 Automatically keeps only source code files. | 13 Automatically keeps only source code files. |
16 Args: | 14 Args: |
| 15 input_api: InputAPI, as in presubmit scripts. |
17 root_dir: The root directory, to which all other paths are relative. | 16 root_dir: The root directory, to which all other paths are relative. |
18 start_paths_list: The list of paths to start search from. Each path can | 17 start_paths_list: The list of paths to start search from. Each path can |
19 be a file or a directory. | 18 be a file or a directory. |
20 excluded_dirs_list: The list of directories to skip. | 19 excluded_dirs_list: The list of directories to skip. |
21 Returns: | 20 Returns: |
22 The list of source code files found, relative to |root_dir|. | 21 The list of source code files found, relative to |root_dir|. |
23 """ | 22 """ |
24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] | 23 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] |
25 def IsBlacklistedDir(d): | 24 def IsBlacklistedDir(d): |
26 for item in dirs_blacklist: | 25 for item in dirs_blacklist: |
27 if item in d: | 26 if item in d: |
28 return True | 27 return True |
29 return False | 28 return False |
30 | 29 |
31 files_whitelist_re = re.compile( | 30 files_whitelist_re = input_api.re.compile( |
32 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' | 31 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' |
33 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' | 32 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' |
34 '|tex|mli?)$') | 33 '|tex|mli?)$') |
35 files = [] | 34 files = [] |
36 | 35 |
37 base_path_len = len(root_dir) | 36 base_path_len = len(root_dir) |
38 for path in start_paths_list: | 37 for path in start_paths_list: |
39 full_path = os.path.join(root_dir, path) | 38 full_path = input_api.os_path.join(root_dir, path) |
40 if os.path.isfile(full_path): | 39 if input_api.os_path.isfile(full_path): |
41 if files_whitelist_re.search(path): | 40 if files_whitelist_re.search(path): |
42 files.append(path) | 41 files.append(path) |
43 else: | 42 else: |
44 for dirpath, dirnames, filenames in os.walk(full_path): | 43 for dirpath, dirnames, filenames in input_api.os_walk(full_path): |
45 # Remove excluded subdirs for faster scanning. | 44 # Remove excluded subdirs for faster scanning. |
46 for item in dirnames[:]: | 45 for item in dirnames[:]: |
47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]): | 46 if IsBlacklistedDir( |
| 47 input_api.os_path.join(dirpath, item)[base_path_len + 1:]): |
48 dirnames.remove(item) | 48 dirnames.remove(item) |
49 for filename in filenames: | 49 for filename in filenames: |
50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:] | 50 filepath = \ |
| 51 input_api.os_path.join(dirpath, filename)[base_path_len + 1:] |
51 if files_whitelist_re.search(filepath) and \ | 52 if files_whitelist_re.search(filepath) and \ |
52 not IsBlacklistedDir(filepath): | 53 not IsBlacklistedDir(filepath): |
53 files.append(filepath) | 54 files.append(filepath) |
54 return files | 55 return files |
55 | 56 |
56 | 57 |
57 python_multiline_string_double_re = re.compile( | 58 class _GeneratedFilesDetector(object): |
58 r'"""[^"]*(?:"""|$)', flags=re.MULTILINE) | 59 GENERATED_FILE = 'GENERATED FILE' |
59 python_multiline_string_single_re = re.compile( | 60 NO_COPYRIGHT = '*No copyright*' |
60 r"'''[^']*(?:'''|$)", flags=re.MULTILINE) | |
61 automatically_generated_re = re.compile( | |
62 r'(All changes made in this file will be lost' | |
63 '|DO NOT (EDIT|delete this file)' | |
64 '|Generated (at|automatically|data)' | |
65 '|Automatically generated' | |
66 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE) | |
67 | 61 |
68 def _IsGeneratedFile(header): | 62 def __init__(self, input_api): |
69 header = header.upper() | 63 self.python_multiline_string_double_re = \ |
70 if '"""' in header: | 64 input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE) |
71 header = python_multiline_string_double_re.sub('', header) | 65 self.python_multiline_string_single_re = \ |
72 if "'''" in header: | 66 input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE) |
73 header = python_multiline_string_single_re.sub('', header) | 67 self.automatically_generated_re = input_api.re.compile( |
74 # First do simple strings lookup to save time. | 68 r'(All changes made in this file will be lost' |
75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: | 69 '|DO NOT (EDIT|delete this file)' |
76 return True | 70 '|Generated (at|automatically|data)' |
77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ | 71 '|Automatically generated' |
78 'GENERATED' in header: | 72 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE) |
79 return automatically_generated_re.search(header) | 73 |
80 return False | 74 def IsGeneratedFile(self, header): |
| 75 header = header.upper() |
| 76 if '"""' in header: |
| 77 header = self.python_multiline_string_double_re.sub('', header) |
| 78 if "'''" in header: |
| 79 header = self.python_multiline_string_single_re.sub('', header) |
| 80 # First do simple strings lookup to save time. |
| 81 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: |
| 82 return True |
| 83 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ |
| 84 'GENERATED' in header: |
| 85 return self.automatically_generated_re.search(header) |
| 86 return False |
81 | 87 |
82 | 88 |
83 GENERATED_FILE = 'GENERATED FILE' | 89 class _CopyrightsScanner(object): |
84 NO_COPYRIGHT = '*No copyright*' | 90 @staticmethod |
| 91 def StaticInit(input_api): |
| 92 _CopyrightsScanner._c_comment_re = \ |
| 93 input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') |
| 94 _CopyrightsScanner._copyright_indicator = \ |
| 95 r'(?:copyright|copr\.|\xc2\xa9|\(c\))' |
| 96 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile( |
| 97 r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \ |
| 98 r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE) |
| 99 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile( |
| 100 r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE) |
85 | 101 |
86 class _CopyrightsScanner(object): | 102 def __init__(self, input_api): |
87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') | |
88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))' | |
89 _full_copyright_indicator_re = \ | |
90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \ | |
91 re.IGNORECASE) | |
92 _copyright_disindicator_re = \ | |
93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE) | |
94 | |
95 def __init__(self): | |
96 self.max_line_numbers_proximity = 3 | 103 self.max_line_numbers_proximity = 3 |
97 self.last_a_item_line_number = -200 | 104 self.last_a_item_line_number = -200 |
98 self.last_b_item_line_number = -100 | 105 self.last_b_item_line_number = -100 |
| 106 self.re = input_api.re |
99 | 107 |
100 def _CloseLineNumbers(self, a, b): | 108 def _CloseLineNumbers(self, a, b): |
101 return 0 <= a - b <= self.max_line_numbers_proximity | 109 return 0 <= a - b <= self.max_line_numbers_proximity |
102 | 110 |
103 def MatchLine(self, line_number, line): | 111 def MatchLine(self, line_number, line): |
104 if '"' in line: | 112 if '"' in line: |
105 line = _CopyrightsScanner._c_comment_re.sub('', line) | 113 line = _CopyrightsScanner._c_comment_re.sub('', line) |
106 upcase_line = line.upper() | 114 upcase_line = line.upper() |
107 # Record '(a)' and '(b)' last occurrences in C++ comments. | 115 # Record '(a)' and '(b)' last occurrences in C++ comments. |
108 # This is to filter out '(c)' used as a list item inside C++ comments. | 116 # This is to filter out '(c)' used as a list item inside C++ comments. |
(...skipping 15 matching lines...) Expand all Loading... |
124 self.last_b_item_line_number) and \ | 132 self.last_b_item_line_number) and \ |
125 self._CloseLineNumbers(self.last_b_item_line_number, | 133 self._CloseLineNumbers(self.last_b_item_line_number, |
126 self.last_a_item_line_number): | 134 self.last_a_item_line_number): |
127 return None | 135 return None |
128 copyr = None | 136 copyr = None |
129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) | 137 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) |
130 if m and \ | 138 if m and \ |
131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): | 139 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): |
132 copyr = m.group(0) | 140 copyr = m.group(0) |
133 # Prettify the authorship string. | 141 # Prettify the authorship string. |
134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) | 142 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr) |
135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) | 143 copyr = self.re.sub( |
136 copyr = re.sub(r'^\s+', '', copyr) | 144 _CopyrightsScanner._copyright_indicator, '', copyr, \ |
137 copyr = re.sub(r'\s{2,}', ' ', copyr) | 145 flags=self.re.IGNORECASE) |
138 copyr = re.sub(r'\\@', '@', copyr) | 146 copyr = self.re.sub(r'^\s+', '', copyr) |
| 147 copyr = self.re.sub(r'\s{2,}', ' ', copyr) |
| 148 copyr = self.re.sub(r'\\@', '@', copyr) |
139 return copyr | 149 return copyr |
140 | 150 |
141 | 151 |
142 def FindCopyrights(root_dir, files_to_scan): | 152 def FindCopyrights(input_api, root_dir, files_to_scan): |
143 """Determines code authorship, and finds generated files. | 153 """Determines code authorship, and finds generated files. |
144 Args: | 154 Args: |
| 155 input_api: InputAPI, as in presubmit scripts. |
145 root_dir: The root directory, to which all other paths are relative. | 156 root_dir: The root directory, to which all other paths are relative. |
146 files_to_scan: The list of file names to scan. | 157 files_to_scan: The list of file names to scan. |
147 Returns: | 158 Returns: |
148 The list of copyrights associated with each of the files given. | 159 The list of copyrights associated with each of the files given. |
149 If a given file is generated, the corresponding list consists of a single | 160 If a given file is generated, the corresponding list consists of a single |
150 entry -- 'GENERATED_FILE' string. If the file has no copyright info, | 161 entry -- 'GENERATED_FILE' string. If the file has no copyright info, |
151 the corresponding list contains 'NO_COPYRIGHT' string. | 162 the corresponding list contains 'NO_COPYRIGHT' string. |
152 """ | 163 """ |
| 164 generated_files_detector = _GeneratedFilesDetector(input_api) |
| 165 _CopyrightsScanner.StaticInit(input_api) |
153 copyrights = [] | 166 copyrights = [] |
154 for file_name in files_to_scan: | 167 for file_name in files_to_scan: |
155 linenum = 0 | 168 linenum = 0 |
156 header = '' | 169 header = [] |
157 file_copyrights = [] | 170 file_copyrights = [] |
158 scanner = _CopyrightsScanner() | 171 scanner = _CopyrightsScanner(input_api) |
159 with open(os.path.join(root_dir, file_name), 'r') as f: | 172 contents = input_api.ReadFile( |
160 for l in f.readlines(): | 173 input_api.os_path.join(root_dir, file_name), 'r') |
161 linenum += 1 | 174 for l in contents.split('\n'): |
162 if linenum <= 25: | 175 linenum += 1 |
163 header += l | 176 if linenum <= 25: |
164 c = scanner.MatchLine(linenum, l) | 177 header.append(l) |
165 if c: | 178 c = scanner.MatchLine(linenum, l) |
166 file_copyrights.append(c) | 179 if c: |
167 if _IsGeneratedFile(header): | 180 file_copyrights.append(c) |
168 copyrights.append([GENERATED_FILE]) | 181 if generated_files_detector.IsGeneratedFile('\n'.join(header)): |
169 elif file_copyrights: | 182 copyrights.append([_GeneratedFilesDetector.GENERATED_FILE]) |
170 copyrights.append(file_copyrights) | 183 elif file_copyrights: |
171 else: | 184 copyrights.append(file_copyrights) |
172 copyrights.append([NO_COPYRIGHT]) | 185 else: |
| 186 copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT]) |
173 return copyrights | 187 return copyrights |
174 | 188 |
175 | 189 |
176 def FindCopyrightViolations(root_dir, files_to_scan): | 190 def FindCopyrightViolations(input_api, root_dir, files_to_scan): |
177 """Looks for files that do not belong exclusively to the Chromium Authors. | 191 """Looks for files that do not belong exclusively to the Chromium Authors. |
178 Args: | 192 Args: |
| 193 input_api: InputAPI, as in presubmit scripts. |
179 root_dir: The root directory, to which all other paths are relative. | 194 root_dir: The root directory, to which all other paths are relative. |
180 files_to_scan: The list of file names to scan. | 195 files_to_scan: The list of file names to scan. |
181 Returns: | 196 Returns: |
182 The list of file names that contain non-Chromium copyrights. | 197 The list of file names that contain non-Chromium copyrights. |
183 """ | 198 """ |
184 copyrights = FindCopyrights(root_dir, files_to_scan) | 199 copyrights = FindCopyrights(input_api, root_dir, files_to_scan) |
185 offending_files = [] | 200 offending_files = [] |
186 allowed_copyrights_re = re.compile( | 201 allowed_copyrights_re = input_api.re.compile( |
187 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' | 202 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' |
188 'All rights reserved.*)$') | 203 'All rights reserved.*)$') |
189 for f, cs in itertools.izip(files_to_scan, copyrights): | 204 for f, cs in itertools.izip(files_to_scan, copyrights): |
190 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT: | 205 if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \ |
| 206 cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT: |
191 continue | 207 continue |
192 for c in cs: | 208 for c in cs: |
193 if not allowed_copyrights_re.match(c): | 209 if not allowed_copyrights_re.match(c): |
194 offending_files.append(os.path.normpath(f)) | 210 offending_files.append(input_api.os_path.normpath(f)) |
195 break | 211 break |
196 return offending_files | 212 return offending_files |
OLD | NEW |