Chromium Code Reviews

Unified diff: third_party/pylint/checkers/similar.py

Issue 739393004: Revert "Revert "pylint: upgrade to 1.3.1"" (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/depot_tools/
Patch Set: Created 6 years ago
 # pylint: disable=W0622
-# Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE).
+# Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE).
 # http://www.logilab.fr/ -- mailto:contact@logilab.fr
 #
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation; either version 2 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
 #
 # You should have received a copy of the GNU General Public License along with
 # this program; if not, write to the Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 """a similarities / code duplication command line tool and pylint checker
 """
-from __future__ import generators
-
 import sys
 from itertools import izip
 
 from logilab.common.ureports import Table
 
 from pylint.interfaces import IRawChecker
 from pylint.checkers import BaseChecker, table_lines_from_stats
 
 
-class Similar:
+class Similar(object):
     """finds copy-pasted lines of code in a project"""
 
     def __init__(self, min_lines=4, ignore_comments=False,
-                 ignore_docstrings=False):
+                 ignore_docstrings=False, ignore_imports=False):
         self.min_lines = min_lines
         self.ignore_comments = ignore_comments
         self.ignore_docstrings = ignore_docstrings
+        self.ignore_imports = ignore_imports
         self.linesets = []
 
-    def append_stream(self, streamid, stream):
+    def append_stream(self, streamid, stream, encoding=None):
         """append a file to search for similarities"""
-        stream.seek(0) # XXX may be removed with astng > 0.23
-        self.linesets.append(LineSet(streamid,
-                                     stream.readlines(),
-                                     self.ignore_comments,
-                                     self.ignore_docstrings))
+        stream.seek(0) # XXX may be removed with astroid > 0.23
+        if encoding is None:
+            readlines = stream.readlines
+        else:
+            readlines = lambda: [line.decode(encoding) for line in stream]
+        try:
+            self.linesets.append(LineSet(streamid,
+                                         readlines(),
+                                         self.ignore_comments,
+                                         self.ignore_docstrings,
+                                         self.ignore_imports))
+        except UnicodeDecodeError:
+            pass
 
     def run(self):
         """start looking for similarities and display results on stdout"""
         self._display_sims(self._compute_sims())
 
     def _compute_sims(self):
         """compute similarities in appended files"""
         no_duplicates = {}
         for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
             duplicate = no_duplicates.setdefault(num, [])
             for couples in duplicate:
                 if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
-                    couples.add( (lineset1, idx1) )
-                    couples.add( (lineset2, idx2) )
+                    couples.add((lineset1, idx1))
+                    couples.add((lineset2, idx2))
                     break
             else:
-                duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) )
+                duplicate.append(set([(lineset1, idx1), (lineset2, idx2)]))
         sims = []
         for num, ensembles in no_duplicates.iteritems():
             for couples in ensembles:
-                sims.append( (num, couples) )
+                sims.append((num, couples))
         sims.sort()
         sims.reverse()
         return sims
 
     def _display_sims(self, sims):
         """display computed similarities on stdout"""
         nb_lignes_dupliquees = 0
         for num, couples in sims:
             print
             print num, "similar lines in", len(couples), "files"
             couples = sorted(couples)
             for lineset, idx in couples:
                 print "==%s:%s" % (lineset.name, idx)
             # pylint: disable=W0631
             for line in lineset._real_lines[idx:idx+num]:
-                print " ", line,
+                print " ", line.rstrip()
             nb_lignes_dupliquees += num * (len(couples)-1)
         nb_total_lignes = sum([len(lineset) for lineset in self.linesets])
         print "TOTAL lines=%s duplicates=%s percent=%.2f" \
             % (nb_total_lignes, nb_lignes_dupliquees,
                nb_lignes_dupliquees*100. / nb_total_lignes)
 
     def _find_common(self, lineset1, lineset2):
         """find similarities in the two given linesets"""
         lines1 = lineset1.enumerate_stripped
         lines2 = lineset2.enumerate_stripped
         find = lineset2.find
         index1 = 0
         min_lines = self.min_lines
         while index1 < len(lineset1):
             skip = 1
             num = 0
-            for index2 in find( lineset1[index1] ):
+            for index2 in find(lineset1[index1]):
                 non_blank = 0
                 for num, ((_, line1), (_, line2)) in enumerate(
                     izip(lines1(index1), lines2(index2))):
                     if line1 != line2:
                         if non_blank > min_lines:
                             yield num, lineset1, index1, lineset2, index2
                             skip = max(skip, num)
                         break
                     if line1:
                         non_blank += 1
                 else:
                     # we may have reach the end
                     num += 1
                     if non_blank > min_lines:
                         yield num, lineset1, index1, lineset2, index2
                         skip = max(skip, num)
             index1 += skip
 
     def _iter_sims(self):
         """iterate on similarities among all files, by making a cartesian
         product
         """
         for idx, lineset in enumerate(self.linesets[:-1]):
             for lineset2 in self.linesets[idx+1:]:
                 for sim in self._find_common(lineset, lineset2):
                     yield sim
 
-def stripped_lines(lines, ignore_comments, ignore_docstrings):
+def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
+    """return lines with leading/trailing whitespace and any ignored code
+    features removed
+    """
+
     strippedlines = []
     docstring = None
     for line in lines:
         line = line.strip()
         if ignore_docstrings:
             if not docstring and \
                (line.startswith('"""') or line.startswith("'''")):
                 docstring = line[:3]
                 line = line[3:]
             if docstring:
                 if line.endswith(docstring):
                     docstring = None
                 line = ''
+        if ignore_imports:
+            if line.startswith("import ") or line.startswith("from "):
+                line = ''
         if ignore_comments:
             # XXX should use regex in checkers/format to avoid cutting
             # at a "#" in a string
             line = line.split('#', 1)[0].strip()
         strippedlines.append(line)
     return strippedlines
 
-class LineSet:
+
+class LineSet(object):
     """Holds and indexes all the lines of a single source file"""
     def __init__(self, name, lines, ignore_comments=False,
-                 ignore_docstrings=False):
+                 ignore_docstrings=False, ignore_imports=False):
         self.name = name
         self._real_lines = lines
         self._stripped_lines = stripped_lines(lines, ignore_comments,
-                                              ignore_docstrings)
+                                              ignore_docstrings,
+                                              ignore_imports)
         self._index = self._mk_index()
 
     def __str__(self):
         return '<Lineset for %s>' % self.name
 
     def __len__(self):
         return len(self._real_lines)
 
     def __getitem__(self, index):
         return self._stripped_lines[index]
(...skipping 20 matching lines...)
 
     def find(self, stripped_line):
         """return positions of the given stripped line in this set"""
         return self._index.get(stripped_line, ())
 
     def _mk_index(self):
         """create the index for this set"""
         index = {}
         for line_no, line in enumerate(self._stripped_lines):
             if line:
-                index.setdefault(line, []).append( line_no )
+                index.setdefault(line, []).append(line_no)
         return index
 
 
 MSGS = {'R0801': ('Similar lines in %s files\n%s',
+                  'duplicate-code',
                   'Indicates that a set of similar lines has been detected \
 among multiple file. This usually means that the code should \
 be refactored to avoid this duplication.')}
 
 def report_similarities(sect, stats, old_stats):
     """make a layout with some stats about duplication"""
     lines = ['', 'now', 'previous', 'difference']
     lines += table_lines_from_stats(stats, old_stats,
                                     ('nb_duplicated_lines',
                                      'percent_duplicated_lines'))
(...skipping 13 matching lines...)
     # messages
     msgs = MSGS
     # configuration options
     # for available dict keys/values see the optik parser 'add_option' method
     options = (('min-similarity-lines',
                 {'default' : 4, 'type' : "int", 'metavar' : '<int>',
                  'help' : 'Minimum lines number of a similarity.'}),
                ('ignore-comments',
                 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                  'help': 'Ignore comments when computing similarities.'}
                ),
                ('ignore-docstrings',
                 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                  'help': 'Ignore docstrings when computing similarities.'}
                ),
-              )
+               ('ignore-imports',
+                {'default' : False, 'type' : 'yn', 'metavar' : '<y or n>',
+                 'help': 'Ignore imports when computing similarities.'}
+               ),
+              )
     # reports
-    reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message
+    reports = (('RP0801', 'Duplication', report_similarities),)
 
     def __init__(self, linter=None):
         BaseChecker.__init__(self, linter)
         Similar.__init__(self, min_lines=4,
                          ignore_comments=True, ignore_docstrings=True)
         self.stats = None
 
     def set_option(self, optname, value, action=None, optdict=None):
         """method called to set an option (registered in the options list)
 
         overridden to report options setting to Similar
         """
         BaseChecker.set_option(self, optname, value, action, optdict)
         if optname == 'min-similarity-lines':
             self.min_lines = self.config.min_similarity_lines
         elif optname == 'ignore-comments':
             self.ignore_comments = self.config.ignore_comments
         elif optname == 'ignore-docstrings':
             self.ignore_docstrings = self.config.ignore_docstrings
+        elif optname == 'ignore-imports':
+            self.ignore_imports = self.config.ignore_imports
 
     def open(self):
         """init the checkers: reset linesets and statistics information"""
         self.linesets = []
         self.stats = self.linter.add_stats(nb_duplicated_lines=0,
                                            percent_duplicated_lines=0)
 
     def process_module(self, node):
         """process a module
 
         the module's content is accessible via the stream object
 
         stream must implement the readlines method
         """
-        self.append_stream(self.linter.current_name, node.file_stream)
+        self.append_stream(self.linter.current_name, node.file_stream, node.file_encoding)
 
     def close(self):
         """compute and display similarities on closing (i.e. end of parsing)"""
         total = sum([len(lineset) for lineset in self.linesets])
         duplicated = 0
         stats = self.stats
         for num, couples in self._compute_sims():
             msg = []
             for lineset, idx in couples:
                 msg.append("==%s:%s" % (lineset.name, idx))
             msg.sort()
             # pylint: disable=W0631
             for line in lineset._real_lines[idx:idx+num]:
                 msg.append(line.rstrip())
             self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
             duplicated += num * (len(couples) - 1)
         stats['nb_duplicated_lines'] = duplicated
         stats['percent_duplicated_lines'] = total and duplicated * 100. / total
 
 
 def register(linter):
     """required method to auto register this checker """
     linter.register_checker(SimilarChecker(linter))
 
 def usage(status=0):
     """display command line usage information"""
     print "finds copy pasted blocks in a set of files"
     print
     print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \
-[-i|--ignore-comments] file1...'
+[-i|--ignore-comments] [--ignore-docstrings] [--ignore-imports] file1...'
     sys.exit(status)
 
-def run(argv=None):
+def Run(argv=None):
     """standalone command line access point"""
     if argv is None:
         argv = sys.argv[1:]
     from getopt import getopt
     s_opts = 'hdi'
-    l_opts = ('help', 'duplicates=', 'ignore-comments')
+    l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports',
+              'ignore-docstrings')
     min_lines = 4
     ignore_comments = False
+    ignore_docstrings = False
+    ignore_imports = False
     opts, args = getopt(argv, s_opts, l_opts)
     for opt, val in opts:
         if opt in ('-d', '--duplicates'):
             min_lines = int(val)
         elif opt in ('-h', '--help'):
             usage()
         elif opt in ('-i', '--ignore-comments'):
             ignore_comments = True
+        elif opt in ('--ignore-docstrings',):
+            ignore_docstrings = True
+        elif opt in ('--ignore-imports',):
+            ignore_imports = True
     if not args:
         usage(1)
-    sim = Similar(min_lines, ignore_comments)
+    sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports)
     for filename in args:
         sim.append_stream(filename, open(filename))
     sim.run()
+    sys.exit(0)
 
 if __name__ == '__main__':
-    run()
+    Run()
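
For context, here is a minimal sketch (not part of the patch) of driving the upgraded module programmatically, using only the API visible in the new revision above: the Similar constructor with the new ignore_imports flag, append_stream() with its optional encoding argument, and run(). The file names and the utf-8 encoding are illustrative assumptions; the module is Python 2 code (izip, print statements), so the sketch follows that style.

# Sketch only: exercises the Similar API from the new revision above.
# 'a.py' and 'b.py' are placeholder file names; 'utf-8' is an assumed encoding.
from pylint.checkers.similar import Similar

sim = Similar(min_lines=4,             # same default as min-similarity-lines
              ignore_comments=True,    # strip '#' comments before comparing
              ignore_docstrings=True,  # strip docstring lines before comparing
              ignore_imports=True)     # new in this revision: strip import/from lines
for filename in ('a.py', 'b.py'):
    stream = open(filename)
    try:
        # When an encoding is given, lines are decoded before indexing, and a
        # file that fails to decode is silently skipped (append_stream above
        # swallows UnicodeDecodeError).
        sim.append_stream(filename, stream, encoding='utf-8')
    finally:
        stream.close()
sim.run()  # prints the duplicated blocks and a TOTAL summary line to stdout

Per the usage() and Run() functions above, this roughly corresponds to running the bundled command-line tool as: symilar --duplicates=4 -i --ignore-docstrings --ignore-imports a.py b.py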