Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(32)

Side by Side Diff: android_webview/tools/copyright_scanner.py

Issue 622493004: [Android WebView] Rewrite copyrights scanner in Python (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Add license header to the manual test script Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | android_webview/tools/find_copyrights.pl » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 """Utilities for scanning source files to determine code authorship.
6 """
7
8 import itertools
9 import os
10 import re
11
12
def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Only files whose extension looks like source code are kept.

  Args:
    root_dir: The root directory, to which all other paths are relative.
      A trailing path separator is tolerated.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory. A path that is a file is only checked
      against the extension whitelist and is returned exactly as given.
    excluded_dirs_list: The list of directories to skip. An entry matches
      wherever its path components occur inside a path relative to
      |root_dir|, including directly under |root_dir|.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  sep = os.sep
  # Surround every excluded directory with separators so that only whole path
  # components can match (e.g. 'out' must not match 'layout').
  dirs_blacklist = [
      sep + os.path.normpath(d).strip(sep) + sep
      for d in excluded_dirs_list if d]

  def IsBlacklistedDir(rel_path):
    # The candidate is wrapped the same way as the blacklist entries,
    # otherwise a first path component (no leading separator) or a bare
    # directory name (no trailing separator) could never match.
    wrapped = sep + rel_path + sep
    return any(item in wrapped for item in dirs_blacklist)

  files_whitelist_re = re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      r'|tex|mli?)$')
  files = []

  for path in start_paths_list:
    full_path = os.path.join(root_dir, path)
    if os.path.isfile(full_path):
      if files_whitelist_re.search(path):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in os.walk(full_path):
      # relpath() is used instead of slicing by len(root_dir), which breaks
      # when |root_dir| ends with a separator.
      rel_dir = os.path.relpath(dirpath, root_dir)
      if rel_dir == os.curdir:
        rel_dir = ''
      # Prune excluded subdirs in place so os.walk() does not descend into
      # them.
      dirnames[:] = [d for d in dirnames
                     if not IsBlacklistedDir(os.path.join(rel_dir, d))]
      for filename in filenames:
        filepath = os.path.join(rel_dir, filename)
        if (files_whitelist_re.search(filepath) and
            not IsBlacklistedDir(filepath)):
          files.append(filepath)
  return files
55
56
# Match a triple-quoted Python string. The body may not contain the quote
# character itself; the match ends at the closing triple quote or, failing
# that, at an end of line (re.MULTILINE), which covers strings that are cut
# off by the end of the scanned header.
python_multiline_string_double_re = re.compile(
    r'"""[^"]*(?:"""|$)', flags=re.MULTILINE)
python_multiline_string_single_re = re.compile(
    r"'''[^']*(?:'''|$)", flags=re.MULTILINE)
# Phrases that mark a file as automatically generated (case-insensitive).
# Every fragment is a raw string so that \W, \s and \w reach the regex engine
# verbatim instead of being treated as (invalid) string escapes.
automatically_generated_re = re.compile(
    r'(All changes made in this file will be lost'
    r'|DO NOT (EDIT|delete this file)'
    r'|Generated (at|automatically|data)'
    r'|Automatically generated'
    r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)
67
def _IsGeneratedFile(header):
  """Checks whether a file header marks the file as automatically generated.

  Triple-quoted Python strings are removed from the header before looking
  for the marker phrases.

  Args:
    header: The text of the beginning of the file.
  Returns:
    True if a "generated file" marker is found, False otherwise.
  """
  header = header.upper()
  if '"""' in header:
    header = python_multiline_string_double_re.sub('', header)
  if "'''" in header:
    header = python_multiline_string_single_re.sub('', header)
  # First do simple strings lookup to save time.
  if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
    return True
  if ('DO NOT EDIT' in header or 'DO NOT DELETE' in header or
      'GENERATED' in header):
    # Always hand back a real boolean rather than a match object / None.
    return bool(automatically_generated_re.search(header))
  return False
81
82
# Sole entry of a file's copyrights list when its header marks the file as
# automatically generated.
GENERATED_FILE = 'GENERATED FILE'
# Sole entry of a file's copyrights list when no copyright statement was
# found in the file.
NO_COPYRIGHT = '*No copyright*'
85
86 class _CopyrightsScanner(object):
87 _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
88 _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
89 _full_copyright_indicator_re = \
90 re.compile(r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$', \
91 re.IGNORECASE)
92 _copyright_disindicator_re = \
93 re.compile(r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE)
94
95 def __init__(self):
96 self.max_line_numbers_proximity = 3
97 self.last_a_item_line_number = -200
98 self.last_b_item_line_number = -100
99
100 def _CloseLineNumbers(self, a, b):
101 return 0 <= a - b <= self.max_line_numbers_proximity
102
103 def MatchLine(self, line_number, line):
104 if '"' in line:
105 line = _CopyrightsScanner._c_comment_re.sub('', line)
106 upcase_line = line.upper()
107 # Record '(a)' and '(b)' last occurences in C++ comments.
108 # This is to filter out '(c)' used as a list item inside C++ comments.
109 # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
110 cpp_comment_idx = upcase_line.find('//')
111 if cpp_comment_idx != -1:
112 if upcase_line.find('(A)') > cpp_comment_idx:
113 self.last_a_item_line_number = line_number
114 if upcase_line.find('(B)') > cpp_comment_idx:
115 self.last_b_item_line_number = line_number
116 # Fast bailout, uses the same patterns as _copyright_indicator regexp.
117 if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
118 and not '\xc2\xa9' in upcase_line:
119 c_item_index = upcase_line.find('(C)')
120 if c_item_index == -1:
121 return None
122 if c_item_index > cpp_comment_idx and \
123 self._CloseLineNumbers(line_number,
124 self.last_b_item_line_number) and \
125 self._CloseLineNumbers(self.last_b_item_line_number,
126 self.last_a_item_line_number):
127 return None
128 copyr = None
129 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
130 if m and \
131 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
132 copyr = m.group(0)
133 # Prettify the authorship string.
134 copyr = re.sub(r'([,.])?\s*$/', '', copyr)
135 copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)
136 copyr = re.sub(r'^\s+', '', copyr)
137 copyr = re.sub(r'\s{2,}', ' ', copyr)
138 copyr = re.sub(r'\\@', '@', copyr)
139 return copyr
140
141
def FindCopyrights(root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    A list with one entry per scanned file, in the same order as
    |files_to_scan|. Each entry is the list of copyright strings found in
    the file. For a generated file the entry is [GENERATED_FILE]; for a file
    with no copyright statements it is [NO_COPYRIGHT].
  """
  # Only this many leading lines are inspected for "generated file" markers.
  header_lines_limit = 25
  results = []
  for file_name in files_to_scan:
    header_lines = []
    found = []
    scanner = _CopyrightsScanner()
    with open(os.path.join(root_dir, file_name), 'r') as source:
      for line_number, text in enumerate(source, 1):
        if line_number <= header_lines_limit:
          header_lines.append(text)
        authorship = scanner.MatchLine(line_number, text)
        if authorship:
          found.append(authorship)
    # The "generated" verdict takes precedence over any copyrights found.
    if _IsGeneratedFile(''.join(header_lines)):
      results.append([GENERATED_FILE])
    elif found:
      results.append(found)
    else:
      results.append([NO_COPYRIGHT])
  return results
174
175
def FindCopyrightViolations(root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files without any copyright statement are not
  reported.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of (normalized) file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      r'All rights reserved.*)$')
  # Builtin zip() is used instead of itertools.izip(), which is not available
  # on Python 3; both lists have one entry per scanned file.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:
      continue
    # A single foreign copyright is enough to report the file.
    if not all(allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(os.path.normpath(f))
  return offending_files
OLDNEW
« no previous file with comments | « no previous file | android_webview/tools/find_copyrights.pl » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698