OLD | NEW |
---|---|
(Empty) | |
1 # Copyright 2014 The Chromium Authors. All rights reserved. | |
2 # Use of this source code is governed by a BSD-style license that can be | |
3 # found in the LICENSE file. | |
4 | |
5 """Utilities for scanning source files to determine code authorship. | |
6 """ | |
7 | |
8 import itertools | |
9 import os | |
10 import re | |
11 | |
12 | |
def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Only files whose extension is in the source-code whitelist are kept.

  Args:
    root_dir: The root directory, to which all other paths are relative.
        May be given with or without a trailing path separator.
    start_paths_list: The list of paths to start search from. Each path can
        be a file or a directory.
    excluded_dirs_list: The list of directories to skip.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  # A directory is excluded when its name, surrounded by '/', occurs anywhere
  # in the path being tested.
  dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]
  def IsBlacklistedDir(d):
    return any(item in d for item in dirs_blacklist)

  files_whitelist_re = re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      r'|tex|mli?)$')
  files = []

  # Length of the prefix to strip in order to make a path relative to
  # |root_dir|. Joining with '' appends a separator only when one is needed,
  # so 'root', 'root/' and '' are all handled correctly (a plain
  # len(root_dir) + 1 would chop a real character in the latter two cases).
  root_prefix_len = len(os.path.join(root_dir, ''))
  for path in start_paths_list:
    full_path = os.path.join(root_dir, path)
    if os.path.isfile(full_path):
      if files_whitelist_re.search(path):
        files.append(path)
    else:
      for dirpath, dirnames, filenames in os.walk(full_path):
        # Remove excluded subdirs for faster scanning. os.walk only honours
        # in-place modifications of |dirnames|, hence the iteration over a
        # copy. The trailing separator lets a directory match on its own
        # name; every file below it would be rejected by the per-file check
        # anyway, so pruning does not change the result.
        for item in dirnames[:]:
          rel_dir = os.path.join(dirpath, item)[root_prefix_len:]
          if IsBlacklistedDir(rel_dir + os.sep):
            dirnames.remove(item)
        for filename in filenames:
          filepath = os.path.join(dirpath, filename)[root_prefix_len:]
          if (files_whitelist_re.search(filepath) and
              not IsBlacklistedDir(filepath)):
            files.append(filepath)
  return files
55 | |
56 | |
57 python_multiline_string_double_re = re.compile( | |
58 r'"""[^"]*(?:"""|$)', flags=re.MULTILINE) | |
59 python_multiline_string_single_re = re.compile( | |
60 r"'''[^']*(?:'''|$)", flags=re.MULTILINE) | |
61 automatically_generated_re = re.compile( | |
62 r'(All changes made in this file will be lost' | |
63 '|DO NOT (EDIT|delete this file)' | |
64 '|Generated (at|automatically|data)' | |
65 '|Automatically generated' | |
66 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE) | |
67 | |
68 def _IsGeneratedFile(header): | |
69 header = header.upper() | |
70 if '"""' in header: | |
71 header = python_multiline_string_double_re.sub('', header) | |
72 if "'''" in header: | |
73 header = python_multiline_string_single_re.sub('', header) | |
74 # First do simple strings lookup to save time. | |
75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: | |
76 return True | |
77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ | |
78 'GENERATED' in header: | |
79 return automatically_generated_re.search(header) | |
80 return False | |
81 | |
82 | |
# Marker that FindCopyrights puts, as the only list entry, for a file whose
# header is recognized as belonging to a generated file.
GENERATED_FILE = 'GENERATED FILE'
# Marker that FindCopyrights puts, as the only list entry, for a file in which
# no copyright string was found.
NO_COPYRIGHT = '*No copyright*'
85 | |
86 class _CopyrightsScanner(object): | |
87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') | |
88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))' | |
89 _full_copyright_indicator_re = \ | |
90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \ | |
91 re.IGNORECASE) | |
92 _copyright_disindicator_re = \ | |
93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE) | |
94 | |
95 def __init__(self): | |
96 self.max_line_numbers_proximity = 3 | |
97 self.last_a_item_line_number = -200 | |
98 self.last_b_item_line_number = -100 | |
99 | |
100 def _CloseLineNumbers(self, a, b): | |
101 return 0 <= a - b <= self.max_line_numbers_proximity | |
102 | |
103 def MatchLine(self, line_number, line): | |
104 if '"' in line: | |
105 line = _CopyrightsScanner._c_comment_re.sub('', line) | |
106 upcase_line = line.upper() | |
107 # Record '(a)' and '(b)' last occurences in C++ comments. | |
mkosiba (inactive)
2014/10/02 13:14:46
Maybe move/copy the explanation from line 121 to h
mnaganov (inactive)
2014/10/02 13:51:30
Done.
| |
108 cpp_comment_idx = upcase_line.find('//') | |
109 if not cpp_comment_idx == -1: | |
mkosiba (inactive)
2014/10/02 13:14:47
cpp_comment_idx != -1 or even '//' in upcase_line
mnaganov (inactive)
2014/10/02 13:51:30
We use the value of cpp_comment_idx if it's not -1
| |
110 if upcase_line.find('(A)') > cpp_comment_idx: | |
111 self.last_a_item_line_number = line_number | |
112 if upcase_line.find('(B)') > cpp_comment_idx: | |
113 self.last_b_item_line_number = line_number | |
114 # Fast bailout, uses the same patterns as _copyright_indicator regexp. | |
115 if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \ | |
116 and not '\xc2\xa9' in upcase_line: | |
117 c_item_index = upcase_line.find('(C)') | |
118 if c_item_index == -1: | |
119 return None | |
120 # Filter out 'c' used as a list item inside C++ comments. | |
121 # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah" | |
122 if c_item_index > cpp_comment_idx and \ | |
123 self._CloseLineNumbers(line_number, | |
124 self.last_b_item_line_number) and \ | |
125 self._CloseLineNumbers(self.last_b_item_line_number, | |
126 self.last_a_item_line_number): | |
127 return None | |
128 copyr = None | |
129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) | |
130 if m and \ | |
131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): | |
132 copyr = m.group(0) | |
133 # Prettify the authorship string. | |
134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) | |
135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) | |
136 copyr = re.sub(r'^\s+', '', copyr) | |
137 copyr = re.sub(r'\s{2,}', ' ', copyr) | |
138 copyr = re.sub(r'\\@', '@', copyr) | |
139 return copyr | |
140 | |
141 | |
def FindCopyrights(root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of copyrights associated with each of the files given, in the
    same order as |files_to_scan|. If a file is generated, the corresponding
    list consists of a single entry -- the GENERATED_FILE string. If the file
    has no copyright info, the corresponding list consists of the
    NO_COPYRIGHT string.
  """
  copyrights = []
  for f in files_to_scan:
    header_lines = []
    file_copyrights = []
    # The scanner keeps per-file state, so a fresh one is needed every time.
    scanner = _CopyrightsScanner()
    # Close the file deterministically: leaving that to the garbage collector
    # may exhaust file descriptors when scanning many files.
    with open(os.path.join(root_dir, f), 'r') as source_file:
      for linenum, l in enumerate(source_file, 1):
        # Only the first 25 lines are checked for "generated file" markers.
        if linenum <= 25:
          header_lines.append(l)
        c = scanner.MatchLine(linenum, l)
        if c:
          file_copyrights.append(c)
    if _IsGeneratedFile(''.join(header_lines)):
      copyrights.append([GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([NO_COPYRIGHT])
  return copyrights
173 | |
174 | |
def FindCopyrightViolations(root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files without any copyright statement are not reported.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of normalized file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      r'All rights reserved.*)$')
  # The builtin zip() is used instead of itertools.izip(), which does not
  # exist on Python 3.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:
      continue
    # A single foreign copyright is enough to report the file.
    if not all(allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(os.path.normpath(f))
  return offending_files
OLD | NEW |