Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 """Utilities for scanning source files to determine code authorship. | |
| 6 """ | |
| 7 | |
| 8 import itertools | |
| 9 import os | |
| 10 import re | |
| 11 | |
| 12 | |
def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Only source code files are returned; they are recognized by file name
  extension.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
        be a file or a directory. A path that is a file is only checked
        against the extension filter, never against |excluded_dirs_list|.
    excluded_dirs_list: The list of directories to skip. An entry matches
        whole path components anywhere below |root_dir|.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  # Surround every name with separators so that only whole path components
  # can match. Empty entries are dropped, otherwise they would turn into '//'
  # and match the (empty) directory part of files located in |root_dir|.
  dirs_blacklist = ['/' + d.strip('/') + '/'
                    for d in excluded_dirs_list if d.strip('/')]

  def IsBlacklistedDir(d):
    # |d| is a directory path relative to |root_dir|. It is wrapped into
    # separators as well, so an excluded directory is also recognized when it
    # is the first or the last component of |d|.
    wrapped = '/' + d + '/'
    return any(item in wrapped for item in dirs_blacklist)

  files_whitelist_re = re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      '|tex|mli?)$')
  files = []

  # Length of |root_dir| plus exactly one trailing separator. os.path.join()
  # does not add a second separator if |root_dir| already ends with one, and
  # yields '' for an empty |root_dir|.
  prefix_len = len(os.path.join(root_dir, ''))
  for path in start_paths_list:
    full_path = os.path.join(root_dir, path)
    if os.path.isfile(full_path):
      if files_whitelist_re.search(path):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in os.walk(full_path):
      # Remove excluded subdirs for faster scanning. os.walk() runs top-down
      # by default, and in that mode it only descends into the entries that
      # are left in |dirnames| after this in-place update.
      dirnames[:] = [
          item for item in dirnames
          if not IsBlacklistedDir(os.path.join(dirpath, item)[prefix_len:])]
      for filename in filenames:
        filepath = os.path.join(dirpath, filename)[prefix_len:]
        # Only the directory part is tested against the blacklist, so a file
        # that happens to be named like an excluded directory is kept.
        if (files_whitelist_re.search(filepath) and
            not IsBlacklistedDir(os.path.dirname(filepath))):
          files.append(filepath)
  return files
| 55 | |
| 56 | |
| 57 python_multiline_string_double_re = re.compile( | |
| 58 r'"""[^"]*(?:"""|$)', flags=re.MULTILINE) | |
| 59 python_multiline_string_single_re = re.compile( | |
| 60 r"'''[^']*(?:'''|$)", flags=re.MULTILINE) | |
| 61 automatically_generated_re = re.compile( | |
| 62 r'(All changes made in this file will be lost' | |
| 63 '|DO NOT (EDIT|delete this file)' | |
| 64 '|Generated (at|automatically|data)' | |
| 65 '|Automatically generated' | |
| 66 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE) | |
| 67 | |
| 68 def _IsGeneratedFile(header): | |
| 69 header = header.upper() | |
| 70 if '"""' in header: | |
| 71 header = python_multiline_string_double_re.sub('', header) | |
| 72 if "'''" in header: | |
| 73 header = python_multiline_string_single_re.sub('', header) | |
| 74 # First do simple strings lookup to save time. | |
| 75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: | |
| 76 return True | |
| 77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ | |
| 78 'GENERATED' in header: | |
| 79 return automatically_generated_re.search(header) | |
| 80 return False | |
| 81 | |
| 82 | |
| 83 GENERATED_FILE = 'GENERATED FILE' | |
| 84 NO_COPYRIGHT = '*No copyright*' | |
| 85 | |
| 86 class _CopyrightsScanner(object): | |
| 87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') | |
| 88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))' | |
| 89 _full_copyright_indicator_re = \ | |
| 90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \ | |
| 91 re.IGNORECASE) | |
| 92 _copyright_disindicator_re = \ | |
| 93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE) | |
| 94 | |
| 95 def __init__(self): | |
| 96 self.max_line_numbers_proximity = 3 | |
| 97 self.last_a_item_line_number = -200 | |
| 98 self.last_b_item_line_number = -100 | |
| 99 | |
| 100 def _CloseLineNumbers(self, a, b): | |
| 101 return 0 <= a - b <= self.max_line_numbers_proximity | |
| 102 | |
| 103 def MatchLine(self, line_number, line): | |
| 104 if '"' in line: | |
| 105 line = _CopyrightsScanner._c_comment_re.sub('', line) | |
| 106 upcase_line = line.upper() | |
| 107 # Record '(a)' and '(b)' last occurences in C++ comments. | |
|
mkosiba (inactive)
2014/10/02 13:14:46
Maybe move/copy the explanation from line 121 to h
mnaganov (inactive)
2014/10/02 13:51:30
Done.
| |
| 108 cpp_comment_idx = upcase_line.find('//') | |
| 109 if not cpp_comment_idx == -1: | |
|
mkosiba (inactive)
2014/10/02 13:14:47
cpp_comment_idx != -1 or even '//' in upcase_line
mnaganov (inactive)
2014/10/02 13:51:30
We use the value of cpp_comment_idx if it's not -1
| |
| 110 if upcase_line.find('(A)') > cpp_comment_idx: | |
| 111 self.last_a_item_line_number = line_number | |
| 112 if upcase_line.find('(B)') > cpp_comment_idx: | |
| 113 self.last_b_item_line_number = line_number | |
| 114 # Fast bailout, uses the same patterns as _copyright_indicator regexp. | |
| 115 if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \ | |
| 116 and not '\xc2\xa9' in upcase_line: | |
| 117 c_item_index = upcase_line.find('(C)') | |
| 118 if c_item_index == -1: | |
| 119 return None | |
| 120 # Filter out 'c' used as a list item inside C++ comments. | |
| 121 # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah" | |
| 122 if c_item_index > cpp_comment_idx and \ | |
| 123 self._CloseLineNumbers(line_number, | |
| 124 self.last_b_item_line_number) and \ | |
| 125 self._CloseLineNumbers(self.last_b_item_line_number, | |
| 126 self.last_a_item_line_number): | |
| 127 return None | |
| 128 copyr = None | |
| 129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) | |
| 130 if m and \ | |
| 131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): | |
| 132 copyr = m.group(0) | |
| 133 # Prettify the authorship string. | |
| 134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) | |
| 135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) | |
| 136 copyr = re.sub(r'^\s+', '', copyr) | |
| 137 copyr = re.sub(r'\s{2,}', ' ', copyr) | |
| 138 copyr = re.sub(r'\\@', '@', copyr) | |
| 139 return copyr | |
| 140 | |
| 141 | |
def FindCopyrights(root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of copyrights associated with each of the files given, in the
    same order as |files_to_scan|. If a file is generated, the corresponding
    list consists of a single entry -- the GENERATED_FILE string. If a file
    has no copyright info, the corresponding list consists of the
    NO_COPYRIGHT string.
  """
  copyrights = []
  for f in files_to_scan:
    # Only the first 25 lines are checked for a "generated file" notice.
    header_lines = []
    file_copyrights = []
    # The scanner keeps per-file state, so every file gets a fresh one.
    scanner = _CopyrightsScanner()
    # The 'with' statement closes the file right after scanning, so a long
    # |files_to_scan| list cannot exhaust file descriptors. Iterating over
    # the file object also avoids loading the whole file into memory.
    with open(os.path.join(root_dir, f), 'r') as source_file:
      for linenum, line in enumerate(source_file, 1):
        if linenum <= 25:
          header_lines.append(line)
        c = scanner.MatchLine(linenum, line)
        if c:
          file_copyrights.append(c)
    # The generated file check takes precedence over any copyrights found.
    if _IsGeneratedFile(''.join(header_lines)):
      copyrights.append([GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([NO_COPYRIGHT])
  return copyrights
| 173 | |
| 174 | |
def FindCopyrightViolations(root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files without any copyright statement are not reported.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of normalized file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  # The builtin zip() is used instead of itertools.izip(), because the latter
  # only exists in Python 2.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] in (GENERATED_FILE, NO_COPYRIGHT):
      continue
    # A single copyright that is not allowed makes the file offending.
    if not all(allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(os.path.normpath(f))
  return offending_files
| OLD | NEW |