Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 # Copyright 2014 The Chromium Authors. All rights reserved. | 1 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 """Utilities for scanning source files to determine code authorship. | 5 """Utilities for scanning source files to determine code authorship. |
| 6 """ | 6 """ |
| 7 | 7 |
| 8 import itertools | 8 import itertools |
| 9 import os | |
| 10 import re | |
| 11 | 9 |
| 12 | 10 |
| 13 def FindFiles(root_dir, start_paths_list, excluded_dirs_list): | 11 def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list): |
| 14 """Similar to UNIX utility find(1), searches for files in the directories. | 12 """Similar to UNIX utility find(1), searches for files in the directories. |
| 15 Automatically leaves out only source code files. | 13 Automatically leaves out only source code files. |
| 16 Args: | 14 Args: |
| 15 input_api: InputAPI, as in presubmit scripts. | |
| 17 root_dir: The root directory, to which all other paths are relative. | 16 root_dir: The root directory, to which all other paths are relative. |
| 18 start_paths_list: The list of paths to start search from. Each path can | 17 start_paths_list: The list of paths to start search from. Each path can |
| 19 be a file or a directory. | 18 be a file or a directory. |
| 20 excluded_dirs_list: The list of directories to skip. | 19 excluded_dirs_list: The list of directories to skip. |
| 21 Returns: | 20 Returns: |
| 22 The list of source code files found, relative to |root_dir|. | 21 The list of source code files found, relative to |root_dir|. |
| 23 """ | 22 """ |
| 24 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] | 23 dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list] |
| 25 def IsBlacklistedDir(d): | 24 def IsBlacklistedDir(d): |
| 26 for item in dirs_blacklist: | 25 for item in dirs_blacklist: |
| 27 if item in d: | 26 if item in d: |
| 28 return True | 27 return True |
| 29 return False | 28 return False |
| 30 | 29 |
| 31 files_whitelist_re = re.compile( | 30 files_whitelist_re = input_api.re.compile( |
| 32 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' | 31 r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' |
| 33 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' | 32 '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' |
| 34 '|tex|mli?)$') | 33 '|tex|mli?)$') |
| 35 files = [] | 34 files = [] |
| 36 | 35 |
| 37 base_path_len = len(root_dir) | 36 base_path_len = len(root_dir) |
| 38 for path in start_paths_list: | 37 for path in start_paths_list: |
| 39 full_path = os.path.join(root_dir, path) | 38 full_path = input_api.os_path.join(root_dir, path) |
| 40 if os.path.isfile(full_path): | 39 if input_api.os_path.isfile(full_path): |
| 41 if files_whitelist_re.search(path): | 40 if files_whitelist_re.search(path): |
| 42 files.append(path) | 41 files.append(path) |
| 43 else: | 42 else: |
| 44 for dirpath, dirnames, filenames in os.walk(full_path): | 43 for dirpath, dirnames, filenames in input_api.os_walk(full_path): |
| 45 # Remove excluded subdirs for faster scanning. | 44 # Remove excluded subdirs for faster scanning. |
| 46 for item in dirnames[:]: | 45 for item in dirnames[:]: |
| 47 if IsBlacklistedDir(os.path.join(dirpath, item)[base_path_len + 1:]): | 46 if IsBlacklistedDir( |
| 47 input_api.os_path.join(dirpath, item)[base_path_len + 1:]): | |
| 48 dirnames.remove(item) | 48 dirnames.remove(item) |
| 49 for filename in filenames: | 49 for filename in filenames: |
| 50 filepath = os.path.join(dirpath, filename)[base_path_len + 1:] | 50 filepath = \ |
| 51 input_api.os_path.join(dirpath, filename)[base_path_len + 1:] | |
| 51 if files_whitelist_re.search(filepath) and \ | 52 if files_whitelist_re.search(filepath) and \ |
| 52 not IsBlacklistedDir(filepath): | 53 not IsBlacklistedDir(filepath): |
| 53 files.append(filepath) | 54 files.append(filepath) |
| 54 return files | 55 return files |
| 55 | 56 |
| 56 | 57 |
| 57 python_multiline_string_double_re = re.compile( | 58 class _GeneratedFilesDetector(object): |
| 58 r'"""[^"]*(?:"""|$)', flags=re.MULTILINE) | 59 GENERATED_FILE = 'GENERATED FILE' |
| 59 python_multiline_string_single_re = re.compile( | 60 NO_COPYRIGHT = '*No copyright*' |
| 60 r"'''[^']*(?:'''|$)", flags=re.MULTILINE) | |
| 61 automatically_generated_re = re.compile( | |
| 62 r'(All changes made in this file will be lost' | |
| 63 '|DO NOT (EDIT|delete this file)' | |
| 64 '|Generated (at|automatically|data)' | |
| 65 '|Automatically generated' | |
| 66 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE) | |
| 67 | 61 |
| 68 def _IsGeneratedFile(header): | 62 @staticmethod |
| 69 header = header.upper() | 63 def StaticInit(input_api): |
| 70 if '"""' in header: | 64 _GeneratedFilesDetector.python_multiline_string_double_re = \ |
|
mkosiba (inactive)
2014/10/21 15:41:22
wouldn't it be simpler to have these be instance m…
mnaganov (inactive)
2014/10/22 09:27:29
Done.
But it's different for _CopyrightsScanner, …
| |
| 71 header = python_multiline_string_double_re.sub('', header) | 65 input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE) |
| 72 if "'''" in header: | 66 _GeneratedFilesDetector.python_multiline_string_single_re = \ |
| 73 header = python_multiline_string_single_re.sub('', header) | 67 input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE) |
| 74 # First do simple strings lookup to save time. | 68 _GeneratedFilesDetector.automatically_generated_re = input_api.re.compile( |
| 75 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: | 69 r'(All changes made in this file will be lost' |
| 76 return True | 70 '|DO NOT (EDIT|delete this file)' |
| 77 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ | 71 '|Generated (at|automatically|data)' |
| 78 'GENERATED' in header: | 72 '|Automatically generated' |
| 79 return automatically_generated_re.search(header) | 73 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE) |
| 80 return False | 74 |
| 75 @staticmethod | |
| 76 def _IsGeneratedFile(header): | |
| 77 header = header.upper() | |
| 78 if '"""' in header: | |
| 79 header = _GeneratedFilesDetector.python_multiline_string_double_re.sub( | |
| 80 '', header) | |
| 81 if "'''" in header: | |
| 82 header = _GeneratedFilesDetector.python_multiline_string_single_re.sub( | |
| 83 '', header) | |
| 84 # First do simple strings lookup to save time. | |
| 85 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: | |
| 86 return True | |
| 87 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \ | |
| 88 'GENERATED' in header: | |
| 89 return _GeneratedFilesDetector.automatically_generated_re.search(header) | |
| 90 return False | |
| 81 | 91 |
| 82 | 92 |
| 83 GENERATED_FILE = 'GENERATED FILE' | 93 class _CopyrightsScanner(object): |
| 84 NO_COPYRIGHT = '*No copyright*' | 94 @staticmethod |
| 95 def StaticInit(input_api): | |
| 96 _CopyrightsScanner._c_comment_re = \ | |
| 97 input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') | |
| 98 _CopyrightsScanner._copyright_indicator = \ | |
| 99 r'(?:copyright|copr\.|\xc2\xa9|\(c\))' | |
| 100 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile( | |
| 101 r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \ | |
| 102 r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE) | |
| 103 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile( | |
| 104 r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE) | |
| 85 | 105 |
| 86 class _CopyrightsScanner(object): | 106 def __init__(self, input_api): |
| 87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') | |
| 88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))' | |
| 89 _full_copyright_indicator_re = \ | |
| 90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \ | |
| 91 re.IGNORECASE) | |
| 92 _copyright_disindicator_re = \ | |
| 93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE) | |
| 94 | |
| 95 def __init__(self): | |
| 96 self.max_line_numbers_proximity = 3 | 107 self.max_line_numbers_proximity = 3 |
| 97 self.last_a_item_line_number = -200 | 108 self.last_a_item_line_number = -200 |
| 98 self.last_b_item_line_number = -100 | 109 self.last_b_item_line_number = -100 |
| 110 self.re = input_api.re | |
| 99 | 111 |
| 100 def _CloseLineNumbers(self, a, b): | 112 def _CloseLineNumbers(self, a, b): |
| 101 return 0 <= a - b <= self.max_line_numbers_proximity | 113 return 0 <= a - b <= self.max_line_numbers_proximity |
| 102 | 114 |
| 103 def MatchLine(self, line_number, line): | 115 def MatchLine(self, line_number, line): |
| 104 if '"' in line: | 116 if '"' in line: |
| 105 line = _CopyrightsScanner._c_comment_re.sub('', line) | 117 line = _CopyrightsScanner._c_comment_re.sub('', line) |
| 106 upcase_line = line.upper() | 118 upcase_line = line.upper() |
| 107 # Record '(a)' and '(b)' last occurences in C++ comments. | 119 # Record '(a)' and '(b)' last occurences in C++ comments. |
| 108 # This is to filter out '(c)' used as a list item inside C++ comments. | 120 # This is to filter out '(c)' used as a list item inside C++ comments. |
| (...skipping 15 matching lines...) | |
| 124 self.last_b_item_line_number) and \ | 136 self.last_b_item_line_number) and \ |
| 125 self._CloseLineNumbers(self.last_b_item_line_number, | 137 self._CloseLineNumbers(self.last_b_item_line_number, |
| 126 self.last_a_item_line_number): | 138 self.last_a_item_line_number): |
| 127 return None | 139 return None |
| 128 copyr = None | 140 copyr = None |
| 129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) | 141 m = _CopyrightsScanner._full_copyright_indicator_re.search(line) |
| 130 if m and \ | 142 if m and \ |
| 131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): | 143 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): |
| 132 copyr = m.group(0) | 144 copyr = m.group(0) |
| 133 # Prettify the authorship string. | 145 # Prettify the authorship string. |
| 134 copyr = re.sub(r'([,.])?\s*$/', '', copyr) | 146 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr) |
| 135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE) | 147 copyr = self.re.sub( |
| 136 copyr = re.sub(r'^\s+', '', copyr) | 148 _CopyrightsScanner._copyright_indicator, '', copyr, \ |
| 137 copyr = re.sub(r'\s{2,}', ' ', copyr) | 149 flags=self.re.IGNORECASE) |
| 138 copyr = re.sub(r'\\@', '@', copyr) | 150 copyr = self.re.sub(r'^\s+', '', copyr) |
| 151 copyr = self.re.sub(r'\s{2,}', ' ', copyr) | |
| 152 copyr = self.re.sub(r'\\@', '@', copyr) | |
| 139 return copyr | 153 return copyr |
| 140 | 154 |
| 141 | 155 |
| 142 def FindCopyrights(root_dir, files_to_scan): | 156 def FindCopyrights(input_api, root_dir, files_to_scan): |
| 143 """Determines code autorship, and finds generated files. | 157 """Determines code autorship, and finds generated files. |
| 144 Args: | 158 Args: |
| 159 input_api: InputAPI, as in presubmit scripts. | |
| 145 root_dir: The root directory, to which all other paths are relative. | 160 root_dir: The root directory, to which all other paths are relative. |
| 146 files_to_scan: The list of file names to scan. | 161 files_to_scan: The list of file names to scan. |
| 147 Returns: | 162 Returns: |
| 148 The list of copyrights associated with each of the files given. | 163 The list of copyrights associated with each of the files given. |
| 149 If the certain file is generated, the corresponding list consists a single | 164 If the certain file is generated, the corresponding list consists a single |
| 150 entry -- 'GENERATED_FILE' string. If the file has no copyright info, | 165 entry -- 'GENERATED_FILE' string. If the file has no copyright info, |
| 151 the corresponding list contains 'NO_COPYRIGHT' string. | 166 the corresponding list contains 'NO_COPYRIGHT' string. |
| 152 """ | 167 """ |
| 168 _GeneratedFilesDetector.StaticInit(input_api) | |
| 169 _CopyrightsScanner.StaticInit(input_api) | |
| 153 copyrights = [] | 170 copyrights = [] |
| 154 for file_name in files_to_scan: | 171 for file_name in files_to_scan: |
| 155 linenum = 0 | 172 linenum = 0 |
| 156 header = '' | 173 header = [] |
| 157 file_copyrights = [] | 174 file_copyrights = [] |
| 158 scanner = _CopyrightsScanner() | 175 scanner = _CopyrightsScanner(input_api) |
| 159 with open(os.path.join(root_dir, file_name), 'r') as f: | 176 contents = input_api.ReadFile( |
| 160 for l in f.readlines(): | 177 input_api.os_path.join(root_dir, file_name), 'rb') |
|
mkosiba (inactive)
2014/10/21 15:41:22
is 'rb' intentional?
mnaganov (inactive)
2014/10/22 09:27:29
Not sure :) Let's stick with 'r', as before.
| |
| 161 linenum += 1 | 178 for l in contents.split('\n'): |
| 162 if linenum <= 25: | 179 linenum += 1 |
| 163 header += l | 180 if linenum <= 25: |
| 164 c = scanner.MatchLine(linenum, l) | 181 header.append(l) |
| 165 if c: | 182 c = scanner.MatchLine(linenum, l) |
| 166 file_copyrights.append(c) | 183 if c: |
| 167 if _IsGeneratedFile(header): | 184 file_copyrights.append(c) |
| 168 copyrights.append([GENERATED_FILE]) | 185 if _GeneratedFilesDetector._IsGeneratedFile('\n'.join(header)): |
| 169 elif file_copyrights: | 186 copyrights.append([_GeneratedFilesDetector.GENERATED_FILE]) |
| 170 copyrights.append(file_copyrights) | 187 elif file_copyrights: |
| 171 else: | 188 copyrights.append(file_copyrights) |
| 172 copyrights.append([NO_COPYRIGHT]) | 189 else: |
| 190 copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT]) | |
| 173 return copyrights | 191 return copyrights |
| 174 | 192 |
| 175 | 193 |
| 176 def FindCopyrightViolations(root_dir, files_to_scan): | 194 def FindCopyrightViolations(input_api, root_dir, files_to_scan): |
| 177 """Looks for files that are not belong exlusively to the Chromium Authors. | 195 """Looks for files that are not belong exlusively to the Chromium Authors. |
| 178 Args: | 196 Args: |
| 197 input_api: InputAPI, as in presubmit scripts. | |
| 179 root_dir: The root directory, to which all other paths are relative. | 198 root_dir: The root directory, to which all other paths are relative. |
| 180 files_to_scan: The list of file names to scan. | 199 files_to_scan: The list of file names to scan. |
| 181 Returns: | 200 Returns: |
| 182 The list of file names that contain non-Chromium copyrights. | 201 The list of file names that contain non-Chromium copyrights. |
| 183 """ | 202 """ |
| 184 copyrights = FindCopyrights(root_dir, files_to_scan) | 203 copyrights = FindCopyrights(input_api, root_dir, files_to_scan) |
| 185 offending_files = [] | 204 offending_files = [] |
| 186 allowed_copyrights_re = re.compile( | 205 allowed_copyrights_re = input_api.re.compile( |
| 187 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' | 206 r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' |
| 188 'All rights reserved.*)$') | 207 'All rights reserved.*)$') |
| 189 for f, cs in itertools.izip(files_to_scan, copyrights): | 208 for f, cs in itertools.izip(files_to_scan, copyrights): |
| 190 if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT: | 209 if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \ |
| 210 cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT: | |
| 191 continue | 211 continue |
| 192 for c in cs: | 212 for c in cs: |
| 193 if not allowed_copyrights_re.match(c): | 213 if not allowed_copyrights_re.match(c): |
| 194 offending_files.append(os.path.normpath(f)) | 214 offending_files.append(input_api.os_path.normpath(f)) |
| 195 break | 215 break |
| 196 return offending_files | 216 return offending_files |
| OLD | NEW |