Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2862)

Unified Diff: android_webview/tools/copyright_scanner.py

Issue 622493004: [Android WebView] Rewrite copyrights scanner in Python (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Add license header to the manual test script Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | android_webview/tools/find_copyrights.pl » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: android_webview/tools/copyright_scanner.py
diff --git a/android_webview/tools/copyright_scanner.py b/android_webview/tools/copyright_scanner.py
new file mode 100644
index 0000000000000000000000000000000000000000..90da30ded7303121fa678c01bd683061f75bc6fc
--- /dev/null
+++ b/android_webview/tools/copyright_scanner.py
@@ -0,0 +1,196 @@
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Utilities for scanning source files to determine code authorship.
+"""
+
+import itertools
+import os
+import re
+
+
def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Only source code files (selected by file extension) are reported.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
                      be a file or a directory. A path naming a file is
                      reported as is (if it has a source code extension),
                      without consulting |excluded_dirs_list|.
    excluded_dirs_list: The list of directories to skip.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  def SurroundWithSlashes(path):
    # Both the excluded entries and the tested paths are put into the same
    # '/a/b/' form, so that a plain substring test matches whole path
    # components only, including the first and the last one.
    return '/' + path.replace(os.sep, '/').strip('/') + '/'

  # Empty entries are dropped: they would turn into '//' and match the
  # (empty) relative path of |root_dir| itself.
  dirs_blacklist = [SurroundWithSlashes(d) for d in excluded_dirs_list if d]

  def IsBlacklistedDir(d):
    wrapped = SurroundWithSlashes(d)
    return any(item in wrapped for item in dirs_blacklist)

  files_whitelist_re = re.compile(
    r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
    r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
    r'|tex|mli?)$')
  files = []

  # Length of |root_dir| including exactly one trailing separator. Unlike
  # len(root_dir) + 1, this stays correct when |root_dir| already ends with a
  # separator or is empty.
  root_prefix_len = len(os.path.join(root_dir, ''))
  for path in start_paths_list:
    full_path = os.path.join(root_dir, path)
    if os.path.isfile(full_path):
      if files_whitelist_re.search(path):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in os.walk(full_path):
      rel_dirpath = dirpath[root_prefix_len:]
      if IsBlacklistedDir(rel_dirpath):
        # The directory being walked is itself excluded: report nothing from
        # it and do not descend any further.
        dirnames[:] = []
        continue
      # Remove excluded subdirs in place, so os.walk does not visit them.
      dirnames[:] = [d for d in dirnames
                     if not IsBlacklistedDir(os.path.join(rel_dirpath, d))]
      for filename in filenames:
        filepath = os.path.join(rel_dirpath, filename)
        if files_whitelist_re.search(filepath):
          files.append(filepath)
  return files
+
+
# Match a Python triple-quoted string. With re.MULTILINE, an unterminated
# string is matched up to the end of the line where a quote character or the
# line end is first met.
python_multiline_string_double_re = re.compile(
  r'"""[^"]*(?:"""|$)', flags=re.MULTILINE)
python_multiline_string_single_re = re.compile(
  r"'''[^']*(?:'''|$)", flags=re.MULTILINE)
# Phrases that mark a file as generated. Every fragment is a raw string, so
# that '\W' and '\s' reach the regexp engine instead of being treated as
# (invalid) string escapes.
automatically_generated_re = re.compile(
  r'(All changes made in this file will be lost'
  r'|DO NOT (EDIT|delete this file)'
  r'|Generated (at|automatically|data)'
  r'|Automatically generated'
  r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)

def _IsGeneratedFile(header):
  """Checks whether the file header says that the file is generated.

  Args:
    header: The text of the beginning of the file.
  Returns:
    True if |header| contains one of the known "generated file" phrases
    outside of Python triple-quoted strings, False otherwise.
  """
  header = header.upper()
  # Drop Python triple-quoted strings, so that phrases inside them are not
  # taken into account.
  if '"""' in header:
    header = python_multiline_string_double_re.sub('', header)
  if "'''" in header:
    header = python_multiline_string_single_re.sub('', header)
  # First do simple strings lookup to save time.
  if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
    return True
  if ('DO NOT EDIT' in header or 'DO NOT DELETE' in header or
      'GENERATED' in header):
    # The substrings above are only a cheap pre-filter, the regexp decides.
    return bool(automatically_generated_re.search(header))
  return False
+
+
GENERATED_FILE = 'GENERATED FILE'
NO_COPYRIGHT = '*No copyright*'

class _CopyrightsScanner(object):
  """Extracts authorship strings from a file which is fed line by line.

  The scanner keeps the line numbers of the last '(a)' and '(b)' seen in C++
  comments, so a single instance must only be used for one file, with lines
  passed to MatchLine in order.
  """
  # Despite the name, this matches a double-quoted string literal, including
  # backslash-escaped characters inside it.
  _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
  _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
  # The indicator, followed by a colon or spaces, then the authorship text
  # (group 1), which must start with a word character.
  _full_copyright_indicator_re = re.compile(
    r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$',
    re.IGNORECASE)
  # Words which, when they directly follow the indicator, mean that the line
  # talks about copyrights rather than states one.
  _copyright_disindicator_re = re.compile(
    r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE)

  def __init__(self):
    self.max_line_numbers_proximity = 3
    # The initial values are far from each other and from any real line
    # number, so the proximity checks fail until real items are seen.
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100

  def _CloseLineNumbers(self, a, b):
    """Returns True if |a| is at or after |b|, by at most the proximity."""
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Looks for an authorship string in a single line of a file.

    Args:
      line_number: The number of the line in the file.
      line: The text of the line.
    Returns:
      The authorship string with the copyright indicators removed, or None
      if the line does not contain one.
    """
    # Text inside double-quoted string literals is not considered.
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurrences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if ('COPYRIGHT' not in upcase_line and 'COPR.' not in upcase_line and
        '\xc2\xa9' not in upcase_line):
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      # '(c)' that closely follows '(b)', which closely follows '(a)', is
      # treated as a list item.
      if (c_item_index > cpp_comment_idx and
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number)):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if (m and
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1))):
      copyr = m.group(0)
      # Prettify the authorship string.
      # Drop one trailing comma or period and any trailing whitespace. The
      # pattern used to end with a stray '/', which made it never match.
      copyr = re.sub(r'[,.]?\s*$', '', copyr)
      copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)
      copyr = re.sub(r'^\s+', '', copyr)
      copyr = re.sub(r'\s{2,}', ' ', copyr)
      copyr = re.sub(r'\\@', '@', copyr)
    return copyr
+
+
def FindCopyrights(root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    A list with one entry per scanned file, in the same order. Each entry is
    a list of the authorship strings found in the file. For a file whose
    first 25 lines mark it as generated, the entry is [GENERATED_FILE]. For
    a file with no authorship strings, the entry is [NO_COPYRIGHT].
  """
  results = []
  for file_name in files_to_scan:
    header_lines = []
    found = []
    # A new scanner per file: it carries line-number state between calls.
    scanner = _CopyrightsScanner()
    with open(os.path.join(root_dir, file_name), 'r') as source:
      for line_number, text in enumerate(source, 1):
        # Only the beginning of the file is checked for "generated" marks.
        if line_number <= 25:
          header_lines.append(text)
        authorship = scanner.MatchLine(line_number, text)
        if authorship:
          found.append(authorship)
    if _IsGeneratedFile(''.join(header_lines)):
      results.append([GENERATED_FILE])
    elif found:
      results.append(found)
    else:
      results.append([NO_COPYRIGHT])
  return results
+
+
def FindCopyrightViolations(root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files with no copyright info are not reported.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of file names that contain non-Chromium copyrights. The names
    are normalized with os.path.normpath.
  """
  copyrights = FindCopyrights(root_dir, files_to_scan)
  allowed_copyrights_re = re.compile(
    r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
    r'All rights reserved.*)$')
  offending_files = []
  # The builtin zip is used instead of itertools.izip, as the latter only
  # exists in Python 2.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] in (GENERATED_FILE, NO_COPYRIGHT):
      continue
    # A single copyright that is not allowed makes the file offending.
    if not all(allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(os.path.normpath(f))
  return offending_files
« no previous file with comments | « no previous file | android_webview/tools/find_copyrights.pl » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698