Index: testing/gmock/scripts/generator/cpp/tokenize.py
diff --git a/testing/gmock/scripts/generator/cpp/tokenize.py b/testing/gmock/scripts/generator/cpp/tokenize.py
new file mode 100755
index 0000000000000000000000000000000000000000..28c334529980b540797034f90a1031001a6c0bcd
--- /dev/null
+++ b/testing/gmock/scripts/generator/cpp/tokenize.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python
+#
+# Copyright 2007 Neal Norwitz
+# Portions Copyright 2007 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenize C++ source code."""
+
+__author__ = 'nnorwitz@google.com (Neal Norwitz)'
+
+
+try:
+    # Python 3.x
+    import builtins
+except ImportError:
+    # Python 2.x
+    import __builtin__ as builtins
+
+
+import sys
+
+from cpp import utils
+
+
+if not hasattr(builtins, 'set'):
+    # Nominal support for Python 2.3.
+    from sets import Set as set
+
+
+# Add $ as a valid identifier char since so much code uses it.
+_letters = 'abcdefghijklmnopqrstuvwxyz'
+VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
+HEX_DIGITS = set('0123456789abcdefABCDEF')
+INT_OR_FLOAT_DIGITS = set('0123456789eE-+')
+
+
+# C++0x string prefixes.
+_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
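+# For example: L"wide", u8"text", R"(raw)".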
+
+
+# Token types.
+UNKNOWN = 'UNKNOWN'
+SYNTAX = 'SYNTAX'
+CONSTANT = 'CONSTANT'
+NAME = 'NAME'
+PREPROCESSOR = 'PREPROCESSOR'
+
+# Where the token originated from.  This can be used for backtracking.
+# It is always set to WHENCE_STREAM in this code.
+WHENCE_STREAM, WHENCE_QUEUE = range(2)
+
+
+class Token(object):
+    """Data container to represent a C++ token.
+
+    Tokens can be identifiers, syntax char(s), constants, or
+    pre-processor directives.
+
+    start contains the index of the first char of the token in the source
+    end contains the index just past the last char of the token in the source
+    """
+
+    def __init__(self, token_type, name, start, end):
+        self.token_type = token_type
+        self.name = name
+        self.start = start
+        self.end = end
+        self.whence = WHENCE_STREAM
+
+    def __str__(self):
+        if not utils.DEBUG:
+            return 'Token(%r)' % self.name
+        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
+
+    __repr__ = __str__
+
+
+def _GetString(source, start, i):
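+    """Finds the end of the string literal whose opening quote is at i.
+
+    Returns the index one past the closing '"'.  The start argument is
+    unused; it is kept for symmetry with _GetChar.
+    """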
+    i = source.find('"', i+1)
+    while source[i-1] == '\\':
+        # Count the trailing backslashes.
+        backslash_count = 1
+        j = i - 2
+        while source[j] == '\\':
+            backslash_count += 1
+            j -= 1
+        # When trailing backslashes are even, they escape each other.
+        if (backslash_count % 2) == 0:
+            break
+        i = source.find('"', i+1)
+    return i + 1
+
+
+def _GetChar(source, start, i):
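+    """Finds the end of the character literal whose opening quote is at i.
+
+    Returns one past the closing "'" (start + 1 if unterminated).
+    """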
+    # NOTE(nnorwitz): may not be quite correct, should be good enough.
+    i = source.find("'", i+1)
+    while source[i-1] == '\\':
+        # Need to special case '\\'.
+        if (i - 2) > start and source[i-2] == '\\':
+            break
+        i = source.find("'", i+1)
+    # Try to handle unterminated single quotes (in a #if 0 block).
+    if i < 0:
+        i = start
+    return i + 1
+
+
+def GetTokens(source):
+    """Returns a sequence of Tokens.
+
+    Args:
+      source: string of C++ source code.
+
+    Yields:
+      Token that represents the next token in the source.
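+
+    Example (illustrative):
+      for token in GetTokens('int x = 0;'):
+          print(token.name)  # prints int, x, =, 0, and ; in turn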
+    """
+    # Cache various valid character sets for speed.
+    valid_identifier_chars = VALID_IDENTIFIER_CHARS
+    hex_digits = HEX_DIGITS
+    int_or_float_digits = INT_OR_FLOAT_DIGITS
+    int_or_float_digits2 = int_or_float_digits | set('.')
+
+    # Only ignore errors while in a #if 0 block.
+    ignore_errors = False
+    count_ifs = 0
+
+    i = 0
+    end = len(source)
+    while i < end:
+        # Skip whitespace.
+        while i < end and source[i].isspace():
+            i += 1
+        if i >= end:
+            return
+
+        token_type = UNKNOWN
+        start = i
+        c = source[i]
+        if c.isalpha() or c == '_':              # Find a string token.
+            token_type = NAME
+            while source[i] in valid_identifier_chars:
+                i += 1
+            # String and character constants can look like a name if
+            # they are something like L"".
+            if (source[i] == "'" and (i - start) == 1 and
+                source[start:i] in 'uUL'):
+                # u, U, and L are valid C++0x character prefixes.
+                token_type = CONSTANT
+                i = _GetChar(source, start, i)
+            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
+                token_type = CONSTANT
+                i = _GetString(source, start, i)
+        elif c == '/' and source[i+1] == '/':    # Find // comments.
+            i = source.find('\n', i)
+            if i == -1:  # Handle EOF.
+                i = end
+            continue
+        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
+            i = source.find('*/', i)
+            if i == -1:  # Handle an unterminated comment at EOF.
+                i = end
+            else:
+                i += 2
+            continue
+        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
+            token_type = SYNTAX
+            i += 1
+            new_ch = source[i]
+            if new_ch == c:
+                i += 1
+            elif c == '-' and new_ch == '>':
+                i += 1
+            elif new_ch == '=':
+                i += 1
+        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
+            token_type = SYNTAX
+            i += 1
+            if c == '.' and source[i].isdigit():
+                token_type = CONSTANT
+                i += 1
+                while source[i] in int_or_float_digits:
+                    i += 1
+                # Handle float suffixes.
+                for suffix in ('l', 'f'):
+                    if suffix == source[i:i+1].lower():
+                        i += 1
+                        break
+        elif c.isdigit():                        # Find integer.
+            token_type = CONSTANT
+            if c == '0' and source[i+1] in 'xX':
+                # Handle hex digits.
+                i += 2
+                while source[i] in hex_digits:
+                    i += 1
+            else:
+                while source[i] in int_or_float_digits2:
+                    i += 1
+            # Handle integer (and float) suffixes.
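+            # Longest suffixes first, so e.g. 'ull' is matched before 'u'.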
+            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
+                size = len(suffix)
+                if suffix == source[i:i+size].lower():
+                    i += size
+                    break
+        elif c == '"':                           # Find string.
+            token_type = CONSTANT
+            i = _GetString(source, start, i)
+        elif c == "'":                           # Find char.
+            token_type = CONSTANT
+            i = _GetChar(source, start, i)
+        elif c == '#':                           # Find pre-processor command.
+            token_type = PREPROCESSOR
+            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
+            if got_if:
+                count_ifs += 1
+            elif source[i:i+6] == '#endif':
+                count_ifs -= 1
+                if count_ifs == 0:
+                    ignore_errors = False
+
+            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
+            while 1:
+                i1 = source.find('\n', i)
+                i2 = source.find('//', i)
+                i3 = source.find('/*', i)
+                i4 = source.find('"', i)
+                # NOTE(nnorwitz): doesn't handle comments in #define macros.
+                # Get the first important symbol (newline, comment, EOF/end).
+                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])
+
+                # Handle #include "dir//foo.h" properly.
+                if source[i] == '"':
+                    i = source.find('"', i+1) + 1
+                    assert i > 0
+                    continue
+                # Keep going if end of the line and the line ends with \.
+                if not (i == i1 and source[i-1] == '\\'):
+                    if got_if:
+                        condition = source[start+4:i].lstrip()
+                        if (condition.startswith('0') or
+                            condition.startswith('(0)')):
+                            ignore_errors = True
+                    break
+                i += 1
+        elif c == '\\':                          # Handle \ in code.
+            # This is different from the pre-processor \ handling.
+            i += 1
+            continue
+        elif ignore_errors:
+            # The tokenizer seems to be in pretty good shape.  This
+            # raise is conditionally disabled so that bogus code
+            # in an #if 0 block can be handled.  Since we will ignore
+            # it anyway, this is probably fine.  So disable the
+            # exception and return the bogus char.
+            i += 1
+        else:
+            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
+                             ('?', i, c, source[i-10:i+10]))
+            raise RuntimeError('unexpected token')
+
+        if i <= 0:
+            print('Invalid index, exiting now.')
+            return
+        yield Token(token_type, source[start:i], start, i)
+
+
+if __name__ == '__main__':
+    def main(argv):
+        """Driver mostly for testing purposes."""
+        for filename in argv[1:]:
+            source = utils.ReadFile(filename)
+            if source is None:
+                continue
+
+            for token in GetTokens(source):
+                print('%-12s: %s' % (token.token_type, token.name))
+                # print('\r%6.2f%%' % (100.0 * index / token.end),)
+            sys.stdout.write('\n')
+
+
+    main(sys.argv)