OLD | NEW |
(Empty) | |
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. |
| 4 |
| 5 """Utilities for scanning source files to determine code authorship. |
| 6 """ |
| 7 |
| 8 import itertools |
| 9 import os |
| 10 import re |
| 11 |
| 12 |
def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Keeps only files whose extension identifies them as source code.

  Args:
    root_dir: The root directory, to which all other paths are relative.
      May be given with or without a trailing path separator.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip. An entry excludes
      every path (relative to |root_dir|) in which it appears as a run of
      whole path components, including at the very start of the path.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  sep = os.sep
  # Surround every excluded directory with separators so that only whole path
  # components can match ('gen' must not exclude 'generator'). Empty entries
  # and '.' are dropped because they would otherwise match far too much.
  excluded_fragments = []
  for excluded_dir in excluded_dirs_list:
    if not excluded_dir:
      continue
    normalized = os.path.normpath(excluded_dir)
    if normalized != os.curdir:
      excluded_fragments.append(sep + normalized + sep)

  def IsExcluded(relative_path):
    # A leading separator lets an excluded directory match the first path
    # component as well.
    probe = sep + relative_path
    for fragment in excluded_fragments:
      if fragment in probe:
        return True
    return False

  source_file_re = re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      r'|tex|mli?)$')

  # Length of |root_dir| including exactly one trailing separator (zero for an
  # empty root). Slicing by this length stays correct whether or not the
  # caller's |root_dir| already ends with a separator.
  root_prefix_len = len(os.path.join(root_dir, ''))

  files = []
  for path in start_paths_list:
    full_path = os.path.join(root_dir, path)
    if os.path.isfile(full_path):
      if source_file_re.search(path) and not IsExcluded(path):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in os.walk(full_path):
      # Prune excluded subdirectories in place so os.walk never descends into
      # them. The trailing separator makes the last component matchable.
      dirnames[:] = [
          name for name in dirnames
          if not IsExcluded(
              os.path.join(dirpath, name, '')[root_prefix_len:])]
      for filename in filenames:
        filepath = os.path.join(dirpath, filename)[root_prefix_len:]
        if source_file_re.search(filepath) and not IsExcluded(filepath):
          files.append(filepath)
  return files
| 55 |
| 56 |
# Regular expressions used by _IsGeneratedFile.
#
# The two "multiline string" patterns match a Python triple-quoted string, or
# an unterminated one running up to the end of a line, so that such text can
# be removed before looking for "generated file" markers.
python_multiline_string_double_re = re.compile(
    r'"""[^"]*(?:"""|$)', flags=re.MULTILINE)
python_multiline_string_single_re = re.compile(
    r"'''[^']*(?:'''|$)", flags=re.MULTILINE)
# Phrases that mark a file as automatically generated. Every fragment is a raw
# string so that regexp escapes such as \W and \s reach the regexp engine
# verbatim instead of being treated as (invalid) string escapes.
automatically_generated_re = re.compile(
    r'(All changes made in this file will be lost'
    r'|DO NOT (EDIT|delete this file)'
    r'|Generated (at|automatically|data)'
    r'|Automatically generated'
    r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)
| 67 |
def _IsGeneratedFile(header):
  """Tells whether the beginning of a file marks it as generated.

  Args:
    header: The text of the first lines of a file.
  Returns:
    A true value (True or a regexp match object) if |header| contains one of
    the known "generated file" markers outside of Python triple-quoted
    strings, a false value (False or None) otherwise.
  """
  text = header.upper()
  # Drop Python triple-quoted strings first, so that markers mentioned inside
  # them are not taken into account.
  for quotes, string_re in (('"""', python_multiline_string_double_re),
                            ("'''", python_multiline_string_single_re)):
    if quotes in text:
      text = string_re.sub('', text)
  # Cheap substring checks come first; the regexp only runs when one of its
  # key phrases is present.
  if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in text:
    return True
  regexp_hints = ('DO NOT EDIT', 'DO NOT DELETE', 'GENERATED')
  if not any(hint in text for hint in regexp_hints):
    return False
  return automatically_generated_re.search(text)
| 81 |
| 82 |
# Placeholder entries used in the per-file lists returned by FindCopyrights.
# Sole entry of the list for a file recognized as automatically generated.
GENERATED_FILE = 'GENERATED FILE'
# Sole entry of the list for a file in which no copyright line was found.
NO_COPYRIGHT = '*No copyright*'
| 85 |
| 86 class _CopyrightsScanner(object): |
| 87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') |
| 88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))' |
| 89 _full_copyright_indicator_re = \ |
| 90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \ |
| 91 re.IGNORECASE) |
| 92 _copyright_disindicator_re = \ |
| 93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE) |
| 94 |
| 95 def __init__(self): |
| 96 self.max_line_numbers_proximity = 3 |
| 97 self.last_a_item_line_number = -200 |
| 98 self.last_b_item_line_number = -100 |
| 99 |
| 100 def _CloseLineNumbers(self, a, b): |
| 101 return 0 <= a - b <= self.max_line_numbers_proximity |
| 102 |
| 103 def MatchLine(self, line_number, line): |
| 104 if '"' in line: |
| 105 line = _CopyrightsScanner._c_comment_re.sub('', line) |
| 106 upcase_line = line.upper() |
| 107 # Record '(a)' and '(b)' last occurences in C++ comments. |
| 108 # This is to filter out '(c)' used as a list item inside C++ comments. |
| 109 # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah" |
| 110 cpp_comment_idx = upcase_line.find('//') |
| 111 if cpp_comment_idx != -1: |
| 112 if upcase_line.find('(A)') > cpp_comment_idx: |
| 113 self.last_a_item_line_number = line_number |
| 114 if upcase_line.find('(B)') > cpp_comment_idx: |
| 115 self.last_b_item_line_number = line_number |
| 116 # Fast bailout, uses the same patterns as _copyright_indicator regexp. |
| 117 if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \ |
| 118 and not '\xc2\xa9' in upcase_line: |
| 119 c_item_index = upcase_line.find('(C)') |
| 120 if c_item_index == -1: |
| 121 return None |
| 122 if c_item_index > cpp_comment_idx and \ |
| 123 self._CloseLineNumbers(line_number, |
| 124 self.last_b_item_line_number) and \ |
| 125 self._CloseLineNumbers(self.last_b_item_line_number, |
| 126 self.last_a_item_line_number): |
| 127 return None |
| 128 copyr = None |
| 129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) |
| 130 if m and \ |
| 131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): |
| 132 copyr = m.group(0) |
| 133 # Prettify the authorship string. |
| 134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) |
| 135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) |
| 136 copyr = re.sub(r'^\s+', '', copyr) |
| 137 copyr = re.sub(r'\s{2,}', ' ', copyr) |
| 138 copyr = re.sub(r'\\@', '@', copyr) |
| 139 return copyr |
| 140 |
| 141 |
def FindCopyrights(root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    A list with one entry per scanned file, in the same order as
    |files_to_scan|. Each entry is the list of copyright strings found in the
    file. For a generated file the entry is a list holding only the
    GENERATED_FILE constant; for a file with no copyright information it is
    a list holding only the NO_COPYRIGHT constant.
  """
  results = []
  for file_name in files_to_scan:
    scanner = _CopyrightsScanner()
    header_lines = []
    found = []
    with open(os.path.join(root_dir, file_name), 'r') as source:
      for number, text in enumerate(source.readlines(), 1):
        # Only the first 25 lines are checked for "generated file" markers.
        if number <= 25:
          header_lines.append(text)
        match = scanner.MatchLine(number, text)
        if match:
          found.append(match)
    # The "generated" verdict takes precedence over any copyrights found.
    if _IsGeneratedFile(''.join(header_lines)):
      results.append([GENERATED_FILE])
    elif found:
      results.append(found)
    else:
      results.append([NO_COPYRIGHT])
  return results
| 174 |
| 175 |
def FindCopyrightViolations(root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files without any copyright information are never
  reported.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of normalized file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(root_dir, files_to_scan)
  allowed_copyrights_re = re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  offending_files = []
  # The builtin zip is used instead of itertools.izip, which does not exist
  # on Python 3; the pairing of files with their copyrights is the same.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:
      continue
    # A single copyright line outside the allowed form makes the file
    # offending.
    if not all(allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(os.path.normpath(f))
  return offending_files
OLD | NEW |