| OLD | NEW |
| 1 # pylint: disable=W0622 | 1 # pylint: disable=W0622 |
| 2 # Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE). | 2 # Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE). |
| 3 # http://www.logilab.fr/ -- mailto:contact@logilab.fr | 3 # http://www.logilab.fr/ -- mailto:contact@logilab.fr |
| 4 # | 4 # |
| 5 # This program is free software; you can redistribute it and/or modify it under | 5 # This program is free software; you can redistribute it and/or modify it under |
| 6 # the terms of the GNU General Public License as published by the Free Software | 6 # the terms of the GNU General Public License as published by the Free Software |
| 7 # Foundation; either version 2 of the License, or (at your option) any later | 7 # Foundation; either version 2 of the License, or (at your option) any later |
| 8 # version. | 8 # version. |
| 9 # | 9 # |
| 10 # This program is distributed in the hope that it will be useful, but WITHOUT | 10 # This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details | 12 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details |
| 13 # | 13 # |
| 14 # You should have received a copy of the GNU General Public License along with | 14 # You should have received a copy of the GNU General Public License along with |
| 15 # this program; if not, write to the Free Software Foundation, Inc., | 15 # this program; if not, write to the Free Software Foundation, Inc., |
| 16 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | 16 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
| 17 """a similarities / code duplication command line tool and pylint checker | 17 """a similarities / code duplication command line tool and pylint checker |
| 18 """ | 18 """ |
| 19 from __future__ import print_function |
| 19 import sys | 20 import sys |
| 20 from itertools import izip | 21 from collections import defaultdict |
| 21 | 22 |
| 22 from logilab.common.ureports import Table | 23 from logilab.common.ureports import Table |
| 23 | 24 |
| 24 from pylint.interfaces import IRawChecker | 25 from pylint.interfaces import IRawChecker |
| 25 from pylint.checkers import BaseChecker, table_lines_from_stats | 26 from pylint.checkers import BaseChecker, table_lines_from_stats |
| 26 | 27 |
| 28 import six |
| 29 from six.moves import zip |
| 30 |
| 27 | 31 |
| 28 class Similar(object): | 32 class Similar(object): |
| 29 """finds copy-pasted lines of code in a project""" | 33 """finds copy-pasted lines of code in a project""" |
| 30 | 34 |
| 31 def __init__(self, min_lines=4, ignore_comments=False, | 35 def __init__(self, min_lines=4, ignore_comments=False, |
| 32 ignore_docstrings=False, ignore_imports=False): | 36 ignore_docstrings=False, ignore_imports=False): |
| 33 self.min_lines = min_lines | 37 self.min_lines = min_lines |
| 34 self.ignore_comments = ignore_comments | 38 self.ignore_comments = ignore_comments |
| 35 self.ignore_docstrings = ignore_docstrings | 39 self.ignore_docstrings = ignore_docstrings |
| 36 self.ignore_imports = ignore_imports | 40 self.ignore_imports = ignore_imports |
| (...skipping 14 matching lines...) Expand all Loading... |
| 51 self.ignore_imports)) | 55 self.ignore_imports)) |
| 52 except UnicodeDecodeError: | 56 except UnicodeDecodeError: |
| 53 pass | 57 pass |
| 54 | 58 |
def run(self):
    """Compute the similarities of the appended files and print them."""
    similarities = self._compute_sims()
    self._display_sims(similarities)
| 58 | 62 |
| 59 def _compute_sims(self): | 63 def _compute_sims(self): |
| 60 """compute similarities in appended files""" | 64 """compute similarities in appended files""" |
| 61 no_duplicates = {} | 65 no_duplicates = defaultdict(list) |
| 62 for num, lineset1, idx1, lineset2, idx2 in self._iter_sims(): | 66 for num, lineset1, idx1, lineset2, idx2 in self._iter_sims(): |
| 63 duplicate = no_duplicates.setdefault(num, []) | 67 duplicate = no_duplicates[num] |
| 64 for couples in duplicate: | 68 for couples in duplicate: |
| 65 if (lineset1, idx1) in couples or (lineset2, idx2) in couples: | 69 if (lineset1, idx1) in couples or (lineset2, idx2) in couples: |
| 66 couples.add((lineset1, idx1)) | 70 couples.add((lineset1, idx1)) |
| 67 couples.add((lineset2, idx2)) | 71 couples.add((lineset2, idx2)) |
| 68 break | 72 break |
| 69 else: | 73 else: |
| 70 duplicate.append(set([(lineset1, idx1), (lineset2, idx2)])) | 74 duplicate.append(set([(lineset1, idx1), (lineset2, idx2)])) |
| 71 sims = [] | 75 sims = [] |
| 72 for num, ensembles in no_duplicates.iteritems(): | 76 for num, ensembles in six.iteritems(no_duplicates): |
| 73 for couples in ensembles: | 77 for couples in ensembles: |
| 74 sims.append((num, couples)) | 78 sims.append((num, couples)) |
| 75 sims.sort() | 79 sims.sort() |
| 76 sims.reverse() | 80 sims.reverse() |
| 77 return sims | 81 return sims |
| 78 | 82 |
| 79 def _display_sims(self, sims): | 83 def _display_sims(self, sims): |
| 80 """display computed similarities on stdout""" | 84 """display computed similarities on stdout""" |
| 81 nb_lignes_dupliquees = 0 | 85 nb_lignes_dupliquees = 0 |
| 82 for num, couples in sims: | 86 for num, couples in sims: |
| 83 print | 87 print() |
| 84 print num, "similar lines in", len(couples), "files" | 88 print(num, "similar lines in", len(couples), "files") |
| 85 couples = sorted(couples) | 89 couples = sorted(couples) |
| 86 for lineset, idx in couples: | 90 for lineset, idx in couples: |
| 87 print "==%s:%s" % (lineset.name, idx) | 91 print("==%s:%s" % (lineset.name, idx)) |
| 88 # pylint: disable=W0631 | 92 # pylint: disable=W0631 |
| 89 for line in lineset._real_lines[idx:idx+num]: | 93 for line in lineset._real_lines[idx:idx+num]: |
| 90 print " ", line.rstrip() | 94 print(" ", line.rstrip()) |
| 91 nb_lignes_dupliquees += num * (len(couples)-1) | 95 nb_lignes_dupliquees += num * (len(couples)-1) |
| 92 nb_total_lignes = sum([len(lineset) for lineset in self.linesets]) | 96 nb_total_lignes = sum([len(lineset) for lineset in self.linesets]) |
| 93 print "TOTAL lines=%s duplicates=%s percent=%.2f" \ | 97 print("TOTAL lines=%s duplicates=%s percent=%.2f" \ |
| 94 % (nb_total_lignes, nb_lignes_dupliquees, | 98 % (nb_total_lignes, nb_lignes_dupliquees, |
| 95 nb_lignes_dupliquees*100. / nb_total_lignes) | 99 nb_lignes_dupliquees*100. / nb_total_lignes)) |
| 96 | 100 |
| 97 def _find_common(self, lineset1, lineset2): | 101 def _find_common(self, lineset1, lineset2): |
| 98 """find similarities in the two given linesets""" | 102 """find similarities in the two given linesets""" |
| 99 lines1 = lineset1.enumerate_stripped | 103 lines1 = lineset1.enumerate_stripped |
| 100 lines2 = lineset2.enumerate_stripped | 104 lines2 = lineset2.enumerate_stripped |
| 101 find = lineset2.find | 105 find = lineset2.find |
| 102 index1 = 0 | 106 index1 = 0 |
| 103 min_lines = self.min_lines | 107 min_lines = self.min_lines |
| 104 while index1 < len(lineset1): | 108 while index1 < len(lineset1): |
| 105 skip = 1 | 109 skip = 1 |
| 106 num = 0 | 110 num = 0 |
| 107 for index2 in find(lineset1[index1]): | 111 for index2 in find(lineset1[index1]): |
| 108 non_blank = 0 | 112 non_blank = 0 |
| 109 for num, ((_, line1), (_, line2)) in enumerate( | 113 for num, ((_, line1), (_, line2)) in enumerate( |
| 110 izip(lines1(index1), lines2(index2))): | 114 zip(lines1(index1), lines2(index2))): |
| 111 if line1 != line2: | 115 if line1 != line2: |
| 112 if non_blank > min_lines: | 116 if non_blank > min_lines: |
| 113 yield num, lineset1, index1, lineset2, index2 | 117 yield num, lineset1, index1, lineset2, index2 |
| 114 skip = max(skip, num) | 118 skip = max(skip, num) |
| 115 break | 119 break |
| 116 if line1: | 120 if line1: |
| 117 non_blank += 1 | 121 non_blank += 1 |
| 118 else: | 122 else: |
| 119 # we may have reach the end | 123 # we may have reach the end |
| 120 num += 1 | 124 num += 1 |
| (...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 200 #if line: | 204 #if line: |
| 201 yield idx, line | 205 yield idx, line |
| 202 idx += 1 | 206 idx += 1 |
| 203 | 207 |
def find(self, stripped_line):
    """Return the positions recorded in the index for `stripped_line`.

    An empty tuple is returned when the line is not in the index.
    """
    index = self._index
    if stripped_line in index:
        return index[stripped_line]
    return ()
| 207 | 211 |
| 208 def _mk_index(self): | 212 def _mk_index(self): |
| 209 """create the index for this set""" | 213 """create the index for this set""" |
| 210 index = {} | 214 index = defaultdict(list) |
| 211 for line_no, line in enumerate(self._stripped_lines): | 215 for line_no, line in enumerate(self._stripped_lines): |
| 212 if line: | 216 if line: |
| 213 index.setdefault(line, []).append(line_no) | 217 index[line].append(line_no) |
| 214 return index | 218 return index |
| 215 | 219 |
| 216 | 220 |
# Message table for the similarity checker: maps the message id to a
# (message template, symbolic name, help text) tuple.
# NOTE(review): the help text uses backslash line continuations, so any
# leading whitespace on the continued lines is part of the string; confirm
# it against the real file before relying on the exact text.
MSGS = {'R0801': ('Similar lines in %s files\n%s',
                  'duplicate-code',
                  'Indicates that a set of similar lines has been detected \
among multiple file. This usually means that the code should \
be refactored to avoid this duplication.')}
| 222 | 226 |
| 223 def report_similarities(sect, stats, old_stats): | 227 def report_similarities(sect, stats, old_stats): |
| (...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 316 stats['nb_duplicated_lines'] = duplicated | 320 stats['nb_duplicated_lines'] = duplicated |
| 317 stats['percent_duplicated_lines'] = total and duplicated * 100. / total | 321 stats['percent_duplicated_lines'] = total and duplicated * 100. / total |
| 318 | 322 |
| 319 | 323 |
def register(linter):
    """Create a SimilarChecker bound to `linter` and register it there."""
    checker = SimilarChecker(linter)
    linter.register_checker(checker)
| 323 | 327 |
def usage(status=0):
    """Print the command line help on stdout, then exit with `status`."""
    synopsis = ('Usage: symilar [-d|--duplicates min_duplicated_lines] '
                '[-i|--ignore-comments] [--ignore-docstrings] '
                '[--ignore-imports] file1...')
    print("finds copy pasted blocks in a set of files")
    print()
    print(synopsis)
    sys.exit(status)
| 331 | 335 |
| 332 def Run(argv=None): | 336 def Run(argv=None): |
| 333 """standalone command line access point""" | 337 """standalone command line access point""" |
| 334 if argv is None: | 338 if argv is None: |
| 335 argv = sys.argv[1:] | 339 argv = sys.argv[1:] |
| 336 from getopt import getopt | 340 from getopt import getopt |
| 337 s_opts = 'hdi' | 341 s_opts = 'hdi' |
| 338 l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports', | 342 l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports', |
| 339 'ignore-docstrings') | 343 'ignore-docstrings') |
| (...skipping 16 matching lines...) Expand all Loading... |
| 356 if not args: | 360 if not args: |
| 357 usage(1) | 361 usage(1) |
| 358 sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports) | 362 sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports) |
| 359 for filename in args: | 363 for filename in args: |
| 360 sim.append_stream(filename, open(filename)) | 364 sim.append_stream(filename, open(filename)) |
| 361 sim.run() | 365 sim.run() |
| 362 sys.exit(0) | 366 sys.exit(0) |
| 363 | 367 |
# Run the standalone command line entry point when executed as a script.
if __name__ == '__main__':
    Run()
| OLD | NEW |