Index: third_party/pylint/checkers/similar.py
===================================================================
--- third_party/pylint/checkers/similar.py	(revision 292986)
+++ third_party/pylint/checkers/similar.py	(working copy)
@@ -1,5 +1,5 @@
 # pylint: disable=W0622
-# Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE).
+# Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE).
 # http://www.logilab.fr/ -- mailto:contact@logilab.fr
 #
 # This program is free software; you can redistribute it and/or modify it under
@@ -13,11 +13,9 @@
 #
 # You should have received a copy of the GNU General Public License along with
 # this program; if not, write to the Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 """a similarities / code duplication command line tool and pylint checker
 """
-from __future__ import generators
-
 import sys
 from itertools import izip
 
@@ -27,23 +25,32 @@
 from pylint.checkers import BaseChecker, table_lines_from_stats
 
 
-class Similar:
+class Similar(object):
     """finds copy-pasted lines of code in a project"""
 
     def __init__(self, min_lines=4, ignore_comments=False,
-                 ignore_docstrings=False):
+                 ignore_docstrings=False, ignore_imports=False):
         self.min_lines = min_lines
         self.ignore_comments = ignore_comments
         self.ignore_docstrings = ignore_docstrings
+        self.ignore_imports = ignore_imports
         self.linesets = []
 
-    def append_stream(self, streamid, stream):
+    def append_stream(self, streamid, stream, encoding=None):
         """append a file to search for similarities"""
-        stream.seek(0) # XXX may be removed with astng > 0.23
-        self.linesets.append(LineSet(streamid,
-                                     stream.readlines(),
-                                     self.ignore_comments,
-                                     self.ignore_docstrings))
+        stream.seek(0) # XXX may be removed with astroid > 0.23
+        if encoding is None:
+            readlines = stream.readlines
+        else:
+            readlines = lambda: [line.decode(encoding) for line in stream]
+        try:
+            self.linesets.append(LineSet(streamid,
+                                         readlines(),
+                                         self.ignore_comments,
+                                         self.ignore_docstrings,
+                                         self.ignore_imports))
+        except UnicodeDecodeError:
+            pass
 
     def run(self):
         """start looking for similarities and display results on stdout"""
@@ -56,15 +63,15 @@
             duplicate = no_duplicates.setdefault(num, [])
             for couples in duplicate:
                 if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
-                    couples.add( (lineset1, idx1) )
-                    couples.add( (lineset2, idx2) )
+                    couples.add((lineset1, idx1))
+                    couples.add((lineset2, idx2))
                     break
             else:
-                duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) )
+                duplicate.append(set([(lineset1, idx1), (lineset2, idx2)]))
         sims = []
         for num, ensembles in no_duplicates.iteritems():
             for couples in ensembles:
-                sims.append( (num, couples) )
+                sims.append((num, couples))
         sims.sort()
         sims.reverse()
         return sims
@@ -80,7 +87,7 @@
                 print "==%s:%s" % (lineset.name, idx)
             # pylint: disable=W0631
             for line in lineset._real_lines[idx:idx+num]:
-                print "  ", line,
+                print "  ", line.rstrip()
             nb_lignes_dupliquees += num * (len(couples)-1)
         nb_total_lignes = sum([len(lineset) for lineset in self.linesets])
         print "TOTAL lines=%s duplicates=%s percent=%.2f" \
@@ -97,10 +104,10 @@
         while index1 < len(lineset1):
             skip = 1
             num = 0
-            for index2 in find( lineset1[index1] ):
+            for index2 in find(lineset1[index1]):
                 non_blank = 0
                 for num, ((_, line1), (_, line2)) in enumerate(
-                    izip(lines1(index1), lines2(index2))):
+                        izip(lines1(index1), lines2(index2))):
                     if line1 != line2:
                         if non_blank > min_lines:
                             yield num, lineset1, index1, lineset2, index2
@@ -125,7 +132,11 @@
                 for sim in self._find_common(lineset, lineset2):
                     yield sim
 
-def stripped_lines(lines, ignore_comments, ignore_docstrings):
+def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
+    """return lines with leading/trailing whitespace and any ignored code
+    features removed
+    """
+
     strippedlines = []
     docstring = None
     for line in lines:
@@ -139,6 +150,9 @@
                 if line.endswith(docstring):
                     docstring = None
                 line = ''
+        if ignore_imports:
+            if line.startswith("import ") or line.startswith("from "):
+                line = ''
         if ignore_comments:
             # XXX should use regex in checkers/format to avoid cutting
             # at a "#" in a string
@@ -146,14 +160,16 @@
         strippedlines.append(line)
     return strippedlines
 
-class LineSet:
+
+class LineSet(object):
     """Holds and indexes all the lines of a single source file"""
     def __init__(self, name, lines, ignore_comments=False,
-                 ignore_docstrings=False):
+                 ignore_docstrings=False, ignore_imports=False):
         self.name = name
         self._real_lines = lines
         self._stripped_lines = stripped_lines(lines, ignore_comments,
-                                              ignore_docstrings)
+                                              ignore_docstrings,
+                                              ignore_imports)
         self._index = self._mk_index()
 
     def __str__(self):
@@ -194,11 +210,12 @@
         index = {}
         for line_no, line in enumerate(self._stripped_lines):
             if line:
-                index.setdefault(line, []).append( line_no )
+                index.setdefault(line, []).append(line_no)
         return index
 
 
 MSGS = {'R0801': ('Similar lines in %s files\n%s',
+                  'duplicate-code',
                   'Indicates that a set of similar lines has been detected \
 among multiple file. This usually means that the code should \
 be refactored to avoid this duplication.')}
@@ -232,14 +249,18 @@
                ('ignore-comments',
                 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                  'help': 'Ignore comments when computing similarities.'}
-                ),
+               ),
                ('ignore-docstrings',
                 {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                  'help': 'Ignore docstrings when computing similarities.'}
-                ),
-               )
+               ),
+               ('ignore-imports',
+                {'default' : False, 'type' : 'yn', 'metavar' : '<y or n>',
+                 'help': 'Ignore imports when computing similarities.'}
+               ),
+              )
     # reports
-    reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message
+    reports = (('RP0801', 'Duplication', report_similarities),)
 
     def __init__(self, linter=None):
         BaseChecker.__init__(self, linter)
@@ -259,6 +280,8 @@
             self.ignore_comments = self.config.ignore_comments
         elif optname == 'ignore-docstrings':
             self.ignore_docstrings = self.config.ignore_docstrings
+        elif optname == 'ignore-imports':
+            self.ignore_imports = self.config.ignore_imports
 
     def open(self):
         """init the checkers: reset linesets and statistics information"""
@@ -273,7 +296,7 @@
 
         stream must implement the readlines method
         """
-        self.append_stream(self.linter.current_name, node.file_stream)
+        self.append_stream(self.linter.current_name, node.file_stream, node.file_encoding)
 
     def close(self):
         """compute and display similarities on closing (i.e. end of parsing)"""
@@ -303,18 +326,21 @@
     print "finds copy pasted blocks in a set of files"
     print
     print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \
-[-i|--ignore-comments] file1...'
+[-i|--ignore-comments] [--ignore-docstrings] [--ignore-imports] file1...'
     sys.exit(status)
 
-def run(argv=None):
+def Run(argv=None):
     """standalone command line access point"""
    if argv is None:
         argv = sys.argv[1:]
     from getopt import getopt
     s_opts = 'hdi'
-    l_opts = ('help', 'duplicates=', 'ignore-comments')
+    l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports',
+              'ignore-docstrings')
     min_lines = 4
     ignore_comments = False
+    ignore_docstrings = False
+    ignore_imports = False
     opts, args = getopt(argv, s_opts, l_opts)
     for opt, val in opts:
         if opt in ('-d', '--duplicates'):
@@ -323,12 +349,17 @@
             usage()
         elif opt in ('-i', '--ignore-comments'):
             ignore_comments = True
+        elif opt in ('--ignore-docstrings',):
+            ignore_docstrings = True
+        elif opt in ('--ignore-imports',):
+            ignore_imports = True
     if not args:
         usage(1)
-    sim = Similar(min_lines, ignore_comments)
+    sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports)
     for filename in args:
         sim.append_stream(filename, open(filename))
     sim.run()
+    sys.exit(0)
 
 if __name__ == '__main__':
-    run()
+    Run()
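For reference, a minimal usage sketch of the API this patch extends. It is not part of the patch itself; the file names are hypothetical, and it assumes the patched module is importable as pylint.checkers.similar:

    from pylint.checkers.similar import Similar

    # Exercise the new ignore_imports flag alongside the existing options.
    sim = Similar(min_lines=4, ignore_comments=True,
                  ignore_docstrings=True, ignore_imports=True)
    for filename in ('module_a.py', 'module_b.py'):  # hypothetical inputs
        # append_stream reads the lines immediately, so the stream does not
        # need to stay open after the call returns.
        sim.append_stream(filename, open(filename))
    sim.run()  # prints duplicated blocks and a TOTAL summary line to stdout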