OLD | NEW |
1 # pylint: disable=W0622 | 1 # pylint: disable=W0622 |
2 # Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE). | 2 # Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE). |
3 # http://www.logilab.fr/ -- mailto:contact@logilab.fr | 3 # http://www.logilab.fr/ -- mailto:contact@logilab.fr |
4 # | 4 # |
5 # This program is free software; you can redistribute it and/or modify it under | 5 # This program is free software; you can redistribute it and/or modify it under |
6 # the terms of the GNU General Public License as published by the Free Software | 6 # the terms of the GNU General Public License as published by the Free Software |
7 # Foundation; either version 2 of the License, or (at your option) any later | 7 # Foundation; either version 2 of the License, or (at your option) any later |
8 # version. | 8 # version. |
9 # | 9 # |
10 # This program is distributed in the hope that it will be useful, but WITHOUT | 10 # This program is distributed in the hope that it will be useful, but WITHOUT |
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details | 12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details |
13 # | 13 # |
14 # You should have received a copy of the GNU General Public License along with | 14 # You should have received a copy of the GNU General Public License along with |
15 # this program; if not, write to the Free Software Foundation, Inc., | 15 # this program; if not, write to the Free Software Foundation, Inc., |
16 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | 16 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 """a similarities / code duplication command line tool and pylint checker | 17 """a similarities / code duplication command line tool and pylint checker |
18 """ | 18 """ |
| 19 from __future__ import generators |
| 20 |
19 import sys | 21 import sys |
20 from itertools import izip | 22 from itertools import izip |
21 | 23 |
22 from logilab.common.ureports import Table | 24 from logilab.common.ureports import Table |
23 | 25 |
24 from pylint.interfaces import IRawChecker | 26 from pylint.interfaces import IRawChecker |
25 from pylint.checkers import BaseChecker, table_lines_from_stats | 27 from pylint.checkers import BaseChecker, table_lines_from_stats |
26 | 28 |
27 | 29 |
28 class Similar(object): | 30 class Similar: |
29 """finds copy-pasted lines of code in a project""" | 31 """finds copy-pasted lines of code in a project""" |
30 | 32 |
31 def __init__(self, min_lines=4, ignore_comments=False, | 33 def __init__(self, min_lines=4, ignore_comments=False, |
32 ignore_docstrings=False, ignore_imports=False): | 34 ignore_docstrings=False): |
33 self.min_lines = min_lines | 35 self.min_lines = min_lines |
34 self.ignore_comments = ignore_comments | 36 self.ignore_comments = ignore_comments |
35 self.ignore_docstrings = ignore_docstrings | 37 self.ignore_docstrings = ignore_docstrings |
36 self.ignore_imports = ignore_imports | |
37 self.linesets = [] | 38 self.linesets = [] |
38 | 39 |
39 def append_stream(self, streamid, stream, encoding=None): | 40 def append_stream(self, streamid, stream): |
40 """append a file to search for similarities""" | 41 """append a file to search for similarities""" |
41 stream.seek(0) # XXX may be removed with astroid > 0.23 | 42 stream.seek(0) # XXX may be removed with astng > 0.23 |
42 if encoding is None: | 43 self.linesets.append(LineSet(streamid, |
43 readlines = stream.readlines | 44 stream.readlines(), |
44 else: | 45 self.ignore_comments, |
45 readlines = lambda: [line.decode(encoding) for line in stream] | 46 self.ignore_docstrings)) |
46 try: | |
47 self.linesets.append(LineSet(streamid, | |
48 readlines(), | |
49 self.ignore_comments, | |
50 self.ignore_docstrings, | |
51 self.ignore_imports)) | |
52 except UnicodeDecodeError: | |
53 pass | |
54 | 47 |
55 def run(self): | 48 def run(self): |
56 """start looking for similarities and display results on stdout""" | 49 """start looking for similarities and display results on stdout""" |
57 self._display_sims(self._compute_sims()) | 50 self._display_sims(self._compute_sims()) |
58 | 51 |
59 def _compute_sims(self): | 52 def _compute_sims(self): |
60 """compute similarities in appended files""" | 53 """compute similarities in appended files""" |
61 no_duplicates = {} | 54 no_duplicates = {} |
62 for num, lineset1, idx1, lineset2, idx2 in self._iter_sims(): | 55 for num, lineset1, idx1, lineset2, idx2 in self._iter_sims(): |
63 duplicate = no_duplicates.setdefault(num, []) | 56 duplicate = no_duplicates.setdefault(num, []) |
64 for couples in duplicate: | 57 for couples in duplicate: |
65 if (lineset1, idx1) in couples or (lineset2, idx2) in couples: | 58 if (lineset1, idx1) in couples or (lineset2, idx2) in couples: |
66 couples.add((lineset1, idx1)) | 59 couples.add( (lineset1, idx1) ) |
67 couples.add((lineset2, idx2)) | 60 couples.add( (lineset2, idx2) ) |
68 break | 61 break |
69 else: | 62 else: |
70 duplicate.append(set([(lineset1, idx1), (lineset2, idx2)])) | 63 duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) ) |
71 sims = [] | 64 sims = [] |
72 for num, ensembles in no_duplicates.iteritems(): | 65 for num, ensembles in no_duplicates.iteritems(): |
73 for couples in ensembles: | 66 for couples in ensembles: |
74 sims.append((num, couples)) | 67 sims.append( (num, couples) ) |
75 sims.sort() | 68 sims.sort() |
76 sims.reverse() | 69 sims.reverse() |
77 return sims | 70 return sims |
78 | 71 |
79 def _display_sims(self, sims): | 72 def _display_sims(self, sims): |
80 """display computed similarities on stdout""" | 73 """display computed similarities on stdout""" |
81 nb_lignes_dupliquees = 0 | 74 nb_lignes_dupliquees = 0 |
82 for num, couples in sims: | 75 for num, couples in sims: |
83 print | 76 print |
84 print num, "similar lines in", len(couples), "files" | 77 print num, "similar lines in", len(couples), "files" |
85 couples = sorted(couples) | 78 couples = sorted(couples) |
86 for lineset, idx in couples: | 79 for lineset, idx in couples: |
87 print "==%s:%s" % (lineset.name, idx) | 80 print "==%s:%s" % (lineset.name, idx) |
88 # pylint: disable=W0631 | 81 # pylint: disable=W0631 |
89 for line in lineset._real_lines[idx:idx+num]: | 82 for line in lineset._real_lines[idx:idx+num]: |
90 print " ", line.rstrip() | 83 print " ", line, |
91 nb_lignes_dupliquees += num * (len(couples)-1) | 84 nb_lignes_dupliquees += num * (len(couples)-1) |
92 nb_total_lignes = sum([len(lineset) for lineset in self.linesets]) | 85 nb_total_lignes = sum([len(lineset) for lineset in self.linesets]) |
93 print "TOTAL lines=%s duplicates=%s percent=%.2f" \ | 86 print "TOTAL lines=%s duplicates=%s percent=%.2f" \ |
94 % (nb_total_lignes, nb_lignes_dupliquees, | 87 % (nb_total_lignes, nb_lignes_dupliquees, |
95 nb_lignes_dupliquees*100. / nb_total_lignes) | 88 nb_lignes_dupliquees*100. / nb_total_lignes) |
96 | 89 |
97 def _find_common(self, lineset1, lineset2): | 90 def _find_common(self, lineset1, lineset2): |
98 """find similarities in the two given linesets""" | 91 """find similarities in the two given linesets""" |
99 lines1 = lineset1.enumerate_stripped | 92 lines1 = lineset1.enumerate_stripped |
100 lines2 = lineset2.enumerate_stripped | 93 lines2 = lineset2.enumerate_stripped |
101 find = lineset2.find | 94 find = lineset2.find |
102 index1 = 0 | 95 index1 = 0 |
103 min_lines = self.min_lines | 96 min_lines = self.min_lines |
104 while index1 < len(lineset1): | 97 while index1 < len(lineset1): |
105 skip = 1 | 98 skip = 1 |
106 num = 0 | 99 num = 0 |
107 for index2 in find(lineset1[index1]): | 100 for index2 in find( lineset1[index1] ): |
108 non_blank = 0 | 101 non_blank = 0 |
109 for num, ((_, line1), (_, line2)) in enumerate( | 102 for num, ((_, line1), (_, line2)) in enumerate( |
110 izip(lines1(index1), lines2(index2))): | 103 izip(lines1(index1), lines2(index2))): |
111 if line1 != line2: | 104 if line1 != line2: |
112 if non_blank > min_lines: | 105 if non_blank > min_lines: |
113 yield num, lineset1, index1, lineset2, index2 | 106 yield num, lineset1, index1, lineset2, index2 |
114 skip = max(skip, num) | 107 skip = max(skip, num) |
115 break | 108 break |
116 if line1: | 109 if line1: |
117 non_blank += 1 | 110 non_blank += 1 |
118 else: | 111 else: |
119 # we may have reach the end | 112 # we may have reach the end |
120 num += 1 | 113 num += 1 |
121 if non_blank > min_lines: | 114 if non_blank > min_lines: |
122 yield num, lineset1, index1, lineset2, index2 | 115 yield num, lineset1, index1, lineset2, index2 |
123 skip = max(skip, num) | 116 skip = max(skip, num) |
124 index1 += skip | 117 index1 += skip |
125 | 118 |
126 def _iter_sims(self): | 119 def _iter_sims(self): |
127 """iterate on similarities among all files, by making a cartesian | 120 """iterate on similarities among all files, by making a cartesian |
128 product | 121 product |
129 """ | 122 """ |
130 for idx, lineset in enumerate(self.linesets[:-1]): | 123 for idx, lineset in enumerate(self.linesets[:-1]): |
131 for lineset2 in self.linesets[idx+1:]: | 124 for lineset2 in self.linesets[idx+1:]: |
132 for sim in self._find_common(lineset, lineset2): | 125 for sim in self._find_common(lineset, lineset2): |
133 yield sim | 126 yield sim |
134 | 127 |
135 def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports): | 128 def stripped_lines(lines, ignore_comments, ignore_docstrings): |
136 """return lines with leading/trailing whitespace and any ignored code | |
137 features removed | |
138 """ | |
139 | |
140 strippedlines = [] | 129 strippedlines = [] |
141 docstring = None | 130 docstring = None |
142 for line in lines: | 131 for line in lines: |
143 line = line.strip() | 132 line = line.strip() |
144 if ignore_docstrings: | 133 if ignore_docstrings: |
145 if not docstring and \ | 134 if not docstring and \ |
146 (line.startswith('"""') or line.startswith("'''")): | 135 (line.startswith('"""') or line.startswith("'''")): |
147 docstring = line[:3] | 136 docstring = line[:3] |
148 line = line[3:] | 137 line = line[3:] |
149 if docstring: | 138 if docstring: |
150 if line.endswith(docstring): | 139 if line.endswith(docstring): |
151 docstring = None | 140 docstring = None |
152 line = '' | 141 line = '' |
153 if ignore_imports: | |
154 if line.startswith("import ") or line.startswith("from "): | |
155 line = '' | |
156 if ignore_comments: | 142 if ignore_comments: |
157 # XXX should use regex in checkers/format to avoid cutting | 143 # XXX should use regex in checkers/format to avoid cutting |
158 # at a "#" in a string | 144 # at a "#" in a string |
159 line = line.split('#', 1)[0].strip() | 145 line = line.split('#', 1)[0].strip() |
160 strippedlines.append(line) | 146 strippedlines.append(line) |
161 return strippedlines | 147 return strippedlines |
162 | 148 |
163 | 149 class LineSet: |
164 class LineSet(object): | |
165 """Holds and indexes all the lines of a single source file""" | 150 """Holds and indexes all the lines of a single source file""" |
166 def __init__(self, name, lines, ignore_comments=False, | 151 def __init__(self, name, lines, ignore_comments=False, |
167 ignore_docstrings=False, ignore_imports=False): | 152 ignore_docstrings=False): |
168 self.name = name | 153 self.name = name |
169 self._real_lines = lines | 154 self._real_lines = lines |
170 self._stripped_lines = stripped_lines(lines, ignore_comments, | 155 self._stripped_lines = stripped_lines(lines, ignore_comments, |
171 ignore_docstrings, | 156 ignore_docstrings) |
172 ignore_imports) | |
173 self._index = self._mk_index() | 157 self._index = self._mk_index() |
174 | 158 |
175 def __str__(self): | 159 def __str__(self): |
176 return '<Lineset for %s>' % self.name | 160 return '<Lineset for %s>' % self.name |
177 | 161 |
178 def __len__(self): | 162 def __len__(self): |
179 return len(self._real_lines) | 163 return len(self._real_lines) |
180 | 164 |
181 def __getitem__(self, index): | 165 def __getitem__(self, index): |
182 return self._stripped_lines[index] | 166 return self._stripped_lines[index] |
(...skipping 20 matching lines...) Expand all Loading... |
203 | 187 |
204 def find(self, stripped_line): | 188 def find(self, stripped_line): |
205 """return positions of the given stripped line in this set""" | 189 """return positions of the given stripped line in this set""" |
206 return self._index.get(stripped_line, ()) | 190 return self._index.get(stripped_line, ()) |
207 | 191 |
208 def _mk_index(self): | 192 def _mk_index(self): |
209 """create the index for this set""" | 193 """create the index for this set""" |
210 index = {} | 194 index = {} |
211 for line_no, line in enumerate(self._stripped_lines): | 195 for line_no, line in enumerate(self._stripped_lines): |
212 if line: | 196 if line: |
213 index.setdefault(line, []).append(line_no) | 197 index.setdefault(line, []).append( line_no ) |
214 return index | 198 return index |
215 | 199 |
216 | 200 |
217 MSGS = {'R0801': ('Similar lines in %s files\n%s', | 201 MSGS = {'R0801': ('Similar lines in %s files\n%s', |
218 'duplicate-code', | |
219 'Indicates that a set of similar lines has been detected \ | 202 'Indicates that a set of similar lines has been detected \ |
220 among multiple file. This usually means that the code should \ | 203 among multiple file. This usually means that the code should \ |
221 be refactored to avoid this duplication.')} | 204 be refactored to avoid this duplication.')} |
222 | 205 |
223 def report_similarities(sect, stats, old_stats): | 206 def report_similarities(sect, stats, old_stats): |
224 """make a layout with some stats about duplication""" | 207 """make a layout with some stats about duplication""" |
225 lines = ['', 'now', 'previous', 'difference'] | 208 lines = ['', 'now', 'previous', 'difference'] |
226 lines += table_lines_from_stats(stats, old_stats, | 209 lines += table_lines_from_stats(stats, old_stats, |
227 ('nb_duplicated_lines', | 210 ('nb_duplicated_lines', |
228 'percent_duplicated_lines')) | 211 'percent_duplicated_lines')) |
(...skipping 13 matching lines...) Expand all Loading... |
242 # messages | 225 # messages |
243 msgs = MSGS | 226 msgs = MSGS |
244 # configuration options | 227 # configuration options |
245 # for available dict keys/values see the optik parser 'add_option' method | 228 # for available dict keys/values see the optik parser 'add_option' method |
246 options = (('min-similarity-lines', | 229 options = (('min-similarity-lines', |
247 {'default' : 4, 'type' : "int", 'metavar' : '<int>', | 230 {'default' : 4, 'type' : "int", 'metavar' : '<int>', |
248 'help' : 'Minimum lines number of a similarity.'}), | 231 'help' : 'Minimum lines number of a similarity.'}), |
249 ('ignore-comments', | 232 ('ignore-comments', |
250 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', | 233 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', |
251 'help': 'Ignore comments when computing similarities.'} | 234 'help': 'Ignore comments when computing similarities.'} |
252 ), | 235 ), |
253 ('ignore-docstrings', | 236 ('ignore-docstrings', |
254 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', | 237 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', |
255 'help': 'Ignore docstrings when computing similarities.'} | 238 'help': 'Ignore docstrings when computing similarities.'} |
256 ), | 239 ), |
257 ('ignore-imports', | 240 ) |
258 {'default' : False, 'type' : 'yn', 'metavar' : '<y or n>', | |
259 'help': 'Ignore imports when computing similarities.'} | |
260 ), | |
261 ) | |
262 # reports | 241 # reports |
263 reports = (('RP0801', 'Duplication', report_similarities),) | 242 reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message |
264 | 243 |
265 def __init__(self, linter=None): | 244 def __init__(self, linter=None): |
266 BaseChecker.__init__(self, linter) | 245 BaseChecker.__init__(self, linter) |
267 Similar.__init__(self, min_lines=4, | 246 Similar.__init__(self, min_lines=4, |
268 ignore_comments=True, ignore_docstrings=True) | 247 ignore_comments=True, ignore_docstrings=True) |
269 self.stats = None | 248 self.stats = None |
270 | 249 |
271 def set_option(self, optname, value, action=None, optdict=None): | 250 def set_option(self, optname, value, action=None, optdict=None): |
272 """method called to set an option (registered in the options list) | 251 """method called to set an option (registered in the options list) |
273 | 252 |
274 overridden to report options setting to Similar | 253 overridden to report options setting to Similar |
275 """ | 254 """ |
276 BaseChecker.set_option(self, optname, value, action, optdict) | 255 BaseChecker.set_option(self, optname, value, action, optdict) |
277 if optname == 'min-similarity-lines': | 256 if optname == 'min-similarity-lines': |
278 self.min_lines = self.config.min_similarity_lines | 257 self.min_lines = self.config.min_similarity_lines |
279 elif optname == 'ignore-comments': | 258 elif optname == 'ignore-comments': |
280 self.ignore_comments = self.config.ignore_comments | 259 self.ignore_comments = self.config.ignore_comments |
281 elif optname == 'ignore-docstrings': | 260 elif optname == 'ignore-docstrings': |
282 self.ignore_docstrings = self.config.ignore_docstrings | 261 self.ignore_docstrings = self.config.ignore_docstrings |
283 elif optname == 'ignore-imports': | |
284 self.ignore_imports = self.config.ignore_imports | |
285 | 262 |
286 def open(self): | 263 def open(self): |
287 """init the checkers: reset linesets and statistics information""" | 264 """init the checkers: reset linesets and statistics information""" |
288 self.linesets = [] | 265 self.linesets = [] |
289 self.stats = self.linter.add_stats(nb_duplicated_lines=0, | 266 self.stats = self.linter.add_stats(nb_duplicated_lines=0, |
290 percent_duplicated_lines=0) | 267 percent_duplicated_lines=0) |
291 | 268 |
292 def process_module(self, node): | 269 def process_module(self, node): |
293 """process a module | 270 """process a module |
294 | 271 |
295 the module's content is accessible via the stream object | 272 the module's content is accessible via the stream object |
296 | 273 |
297 stream must implement the readlines method | 274 stream must implement the readlines method |
298 """ | 275 """ |
299 self.append_stream(self.linter.current_name, node.file_stream, node.file_encoding) | 276 self.append_stream(self.linter.current_name, node.file_stream) |
300 | 277 |
301 def close(self): | 278 def close(self): |
302 """compute and display similarities on closing (i.e. end of parsing)""" | 279 """compute and display similarities on closing (i.e. end of parsing)""" |
303 total = sum([len(lineset) for lineset in self.linesets]) | 280 total = sum([len(lineset) for lineset in self.linesets]) |
304 duplicated = 0 | 281 duplicated = 0 |
305 stats = self.stats | 282 stats = self.stats |
306 for num, couples in self._compute_sims(): | 283 for num, couples in self._compute_sims(): |
307 msg = [] | 284 msg = [] |
308 for lineset, idx in couples: | 285 for lineset, idx in couples: |
309 msg.append("==%s:%s" % (lineset.name, idx)) | 286 msg.append("==%s:%s" % (lineset.name, idx)) |
310 msg.sort() | 287 msg.sort() |
311 # pylint: disable=W0631 | 288 # pylint: disable=W0631 |
312 for line in lineset._real_lines[idx:idx+num]: | 289 for line in lineset._real_lines[idx:idx+num]: |
313 msg.append(line.rstrip()) | 290 msg.append(line.rstrip()) |
314 self.add_message('R0801', args=(len(couples), '\n'.join(msg))) | 291 self.add_message('R0801', args=(len(couples), '\n'.join(msg))) |
315 duplicated += num * (len(couples) - 1) | 292 duplicated += num * (len(couples) - 1) |
316 stats['nb_duplicated_lines'] = duplicated | 293 stats['nb_duplicated_lines'] = duplicated |
317 stats['percent_duplicated_lines'] = total and duplicated * 100. / total | 294 stats['percent_duplicated_lines'] = total and duplicated * 100. / total |
318 | 295 |
319 | 296 |
320 def register(linter): | 297 def register(linter): |
321 """required method to auto register this checker """ | 298 """required method to auto register this checker """ |
322 linter.register_checker(SimilarChecker(linter)) | 299 linter.register_checker(SimilarChecker(linter)) |
323 | 300 |
324 def usage(status=0): | 301 def usage(status=0): |
325 """display command line usage information""" | 302 """display command line usage information""" |
326 print "finds copy pasted blocks in a set of files" | 303 print "finds copy pasted blocks in a set of files" |
327 print | 304 print |
328 print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \ | 305 print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \ |
329 [-i|--ignore-comments] [--ignore-docstrings] [--ignore-imports] file1...' | 306 [-i|--ignore-comments] file1...' |
330 sys.exit(status) | 307 sys.exit(status) |
331 | 308 |
332 def Run(argv=None): | 309 def run(argv=None): |
333 """standalone command line access point""" | 310 """standalone command line access point""" |
334 if argv is None: | 311 if argv is None: |
335 argv = sys.argv[1:] | 312 argv = sys.argv[1:] |
336 from getopt import getopt | 313 from getopt import getopt |
337 s_opts = 'hdi' | 314 s_opts = 'hdi' |
338 l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports', | 315 l_opts = ('help', 'duplicates=', 'ignore-comments') |
339 'ignore-docstrings') | |
340 min_lines = 4 | 316 min_lines = 4 |
341 ignore_comments = False | 317 ignore_comments = False |
342 ignore_docstrings = False | |
343 ignore_imports = False | |
344 opts, args = getopt(argv, s_opts, l_opts) | 318 opts, args = getopt(argv, s_opts, l_opts) |
345 for opt, val in opts: | 319 for opt, val in opts: |
346 if opt in ('-d', '--duplicates'): | 320 if opt in ('-d', '--duplicates'): |
347 min_lines = int(val) | 321 min_lines = int(val) |
348 elif opt in ('-h', '--help'): | 322 elif opt in ('-h', '--help'): |
349 usage() | 323 usage() |
350 elif opt in ('-i', '--ignore-comments'): | 324 elif opt in ('-i', '--ignore-comments'): |
351 ignore_comments = True | 325 ignore_comments = True |
352 elif opt in ('--ignore-docstrings',): | |
353 ignore_docstrings = True | |
354 elif opt in ('--ignore-imports',): | |
355 ignore_imports = True | |
356 if not args: | 326 if not args: |
357 usage(1) | 327 usage(1) |
358 sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports) | 328 sim = Similar(min_lines, ignore_comments) |
359 for filename in args: | 329 for filename in args: |
360 sim.append_stream(filename, open(filename)) | 330 sim.append_stream(filename, open(filename)) |
361 sim.run() | 331 sim.run() |
362 sys.exit(0) | |
363 | 332 |
364 if __name__ == '__main__': | 333 if __name__ == '__main__': |
365 Run() | 334 run() |
OLD | NEW |