android_webview/tools/copyright_scanner.py - Issue 622493004: [Android WebView] Rewrite copyrights scanner in Python

Unified Diff: android_webview/tools/copyright_scanner.py

Issue 622493004: [Android WebView] Rewrite copyrights scanner in Python (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 6 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: android_webview/tools/copyright_scanner.py

diff --git a/android_webview/tools/copyright_scanner.py b/android_webview/tools/copyright_scanner.py

new file mode 100644

index 0000000000000000000000000000000000000000..c68b7507794681304ad9fb72f9e8cca1b7095bfc

--- /dev/null

+++ b/android_webview/tools/copyright_scanner.py

@@ -0,0 +1,195 @@

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

+"""Utilities for scanning source files to determine code authorship.

+"""

+import itertools

+import os

+import re

def FindFiles(root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Automatically keeps only source code files, selected by file extension.

  Args:
    root_dir: The root directory, to which all other paths are relative.
      NOTE(review): results are produced by slicing len(root_dir) + 1
      characters off the joined path, so |root_dir| is presumably expected
      not to end with a path separator -- confirm against callers.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip. A directory is
      skipped wherever '/<dir>/' occurs in the slash-delimited relative path.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]

  def IsBlacklistedDir(d):
    return any(item in d for item in dirs_blacklist)

  files_whitelist_re = re.compile(
    r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
    r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
    r'|tex|mli?)$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = os.path.join(root_dir, path)
    if os.path.isfile(full_path):
      # Explicitly listed files are only filtered by extension.
      if files_whitelist_re.search(path):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in os.walk(full_path):
      # Remove excluded subdirs for faster scanning. os.walk only honours
      # in-place changes to |dirnames|, hence the slice assignment. The
      # candidate is wrapped in slashes so that the '/<dir>/' patterns also
      # match a directory that is the first or the last path component.
      dirnames[:] = [
        item for item in dirnames
        if not IsBlacklistedDir(
          '/' + os.path.join(dirpath, item)[base_path_len + 1:] + '/')]
      for filename in filenames:
        filepath = os.path.join(dirpath, filename)[base_path_len + 1:]
        # The leading slash lets a top-level excluded directory match too.
        if files_whitelist_re.search(filepath) and \
            not IsBlacklistedDir('/' + filepath):
          files.append(filepath)
  return files

# Match a triple-quoted Python string, or the remainder of the line when the
# closing quotes are missing. Used to drop such strings from a file header
# before looking for "generated file" markers in it.
python_multiline_string_double_re = re.compile(
  r'"""[^"]*(?:"""|$)', flags=re.MULTILINE)
python_multiline_string_single_re = re.compile(
  r"'''[^']*(?:'''|$)", flags=re.MULTILINE)

# Phrases that mark a file as automatically generated. Every fragment is a
# raw string so that '\W', '\s' and '\w' reach the regexp engine verbatim.
automatically_generated_re = re.compile(
  r'(All changes made in this file will be lost'
  r'|DO NOT (EDIT|delete this file)'
  r'|Generated (at|automatically|data)'
  r'|Automatically generated'
  r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=re.IGNORECASE)


def _IsGeneratedFile(header):
  """Tells whether the file header carries an "auto-generated" marker.

  Args:
    header: The text of the leading lines of a file.
  Returns:
    True if the header, with triple-quoted Python strings removed, contains
    one of the phrases from |automatically_generated_re|; False otherwise.
  """
  header = header.upper()
  if '"""' in header:
    header = python_multiline_string_double_re.sub('', header)
  if "'''" in header:
    header = python_multiline_string_single_re.sub('', header)
  # First do simple strings lookup to save time.
  if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
    return True
  if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
      'GENERATED' in header:
    # Return a real boolean rather than leaking the match object.
    return automatically_generated_re.search(header) is not None
  return False


# Marker entries used in place of copyright strings in scan results.
GENERATED_FILE = 'GENERATED FILE'
NO_COPYRIGHT = '*No copyright*'

class _CopyrightsScanner(object):
  """Extracts authorship strings from source lines fed in one at a time.

  The scanner keeps track of the last lines on which '(a)' and '(b)' were
  seen inside a C++ comment, so that a '(c)' closely following them can be
  recognized as a list item rather than as a copyright sign.
  """

  # Despite the name, this matches double-quoted string literals (with
  # backslash escapes); they are removed from a line before it is examined.
  _c_comment_re = re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
  # '\xc2\xa9' is the UTF-8 byte sequence of the copyright sign.
  # NOTE(review): this presumably relies on lines being byte strings, as
  # they are when read in text mode under Python 2 -- confirm.
  _copyright_indicator = r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
  _full_copyright_indicator_re = re.compile(
    r'(?:\W|^)' + _copyright_indicator + r'(?::\s*|\s+)(\w.*)$',
    re.IGNORECASE)
  # Words that, when following the indicator, show that the line merely
  # talks about copyright ("copyright notice", "copyright and ...").
  _copyright_disindicator_re = re.compile(
    r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', re.IGNORECASE)

  def __init__(self):
    self.max_line_numbers_proximity = 3
    # Initial values are far apart from each other and from any real line
    # number, so that they never satisfy _CloseLineNumbers.
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100

  def _CloseLineNumbers(self, a, b):
    """Tells whether line |a| is at most the allowed distance after |b|."""
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Looks for a copyright statement in a single source line.

    Args:
      line_number: The number of the line within its file.
      line: The text of the line.
    Returns:
      The prettified authorship string, or None if the line contains no
      copyright statement.
    """
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurrences in C++ comments. They are
    # needed below to tell a '(c)' list item from a copyright sign.
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if 'COPYRIGHT' not in upcase_line and 'COPR.' not in upcase_line \
        and '\xc2\xa9' not in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      # Filter out 'c' used as a list item inside C++ comments.
      # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string: drop one trailing comma or period
      # together with trailing whitespace, remove the indicator itself,
      # trim and collapse whitespace, and unescape '\@'.
      copyr = re.sub(r'[,.]?\s*$', '', copyr)
      copyr = re.sub(self._copyright_indicator, '', copyr, flags=re.IGNORECASE)
      copyr = re.sub(r'^\s+', '', copyr)
      copyr = re.sub(r'\s{2,}', ' ', copyr)
      copyr = re.sub(r'\\@', '@', copyr)
    return copyr

def FindCopyrights(root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of copyrights associated with each of the files given.
    If a file is generated, the corresponding list consists of a single
    entry -- the GENERATED_FILE string. If the file has no copyright info,
    the corresponding list contains the NO_COPYRIGHT string.
  """
  copyrights = []
  for f in files_to_scan:
    header_lines = []
    file_copyrights = []
    scanner = _CopyrightsScanner()
    # The 'with' block closes the file promptly, so that scanning a long
    # list of files does not exhaust file descriptors.
    with open(os.path.join(root_dir, f), 'r') as source_file:
      for linenum, l in enumerate(source_file, 1):
        # Only the first 25 lines are checked for "generated file" markers.
        if linenum <= 25:
          header_lines.append(l)
        c = scanner.MatchLine(linenum, l)
        if c:
          file_copyrights.append(c)
    if _IsGeneratedFile(''.join(header_lines)):
      copyrights.append([GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([NO_COPYRIGHT])
  return copyrights

def FindCopyrightViolations(root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of file names that contain non-Chromium copyrights. Generated
    files and files without any copyright are not reported.
  """
  copyrights = FindCopyrights(root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = re.compile(
    r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
    r'All rights reserved.*)$')
  # The builtin zip is used instead of itertools.izip, which is only
  # available in Python 2.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == GENERATED_FILE or cs[0] == NO_COPYRIGHT:
      continue
    # A single copyright that is not allowed makes the file offending.
    if any(not allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(os.path.normpath(f))
  return offending_files

« no previous file with comments | « no previous file | android_webview/tools/find_copyrights.pl » ('j') | no next file with comments »