testing/gmock/scripts/generator/cpp/tokenize.py - Issue 113807: Checkin a version of gmock, modified to use our boost_tuple in VS2005.

Unified Diff: testing/gmock/scripts/generator/cpp/tokenize.py

Issue 113807: Checkin a version of gmock, modified to use our boost_tuple in VS2005. (Closed)

Patch Set: Fix grammar issue. Created 11 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: testing/gmock/scripts/generator/cpp/tokenize.py

diff --git a/testing/gmock/scripts/generator/cpp/tokenize.py b/testing/gmock/scripts/generator/cpp/tokenize.py

new file mode 100755

index 0000000000000000000000000000000000000000..28c334529980b540797034f90a1031001a6c0bcd

--- /dev/null

+++ b/testing/gmock/scripts/generator/cpp/tokenize.py

@@ -0,0 +1,287 @@

+#!/usr/bin/env python

+# Licensed under the Apache License, Version 2.0 (the "License");

+# you may not use this file except in compliance with the License.

+# You may obtain a copy of the License at

+# http://www.apache.org/licenses/LICENSE-2.0

+# Unless required by applicable law or agreed to in writing, software

+# distributed under the License is distributed on an "AS IS" BASIS,

+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+# See the License for the specific language governing permissions and

+# limitations under the License.

+"""Tokenize C++ source code."""

+__author__ = 'nnorwitz@google.com (Neal Norwitz)'

+try:

+ # Python 3.x

+ import builtins

+except ImportError:

+ # Python 2.x

+ import __builtin__ as builtins

+import sys

+from cpp import utils

+if not hasattr(builtins, 'set'):

+ # Nominal support for Python 2.3.

+ from sets import Set as set

# Character classes used by the tokenizer.  They are built with set(...)
# calls (not set literals) so the sets.Set fallback imported above for old
# Python versions keeps working.

# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
# Digits plus the exponent marker and sign chars that may appear in a float.
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')

# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))

# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)

class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    Attributes:
      token_type: the token type passed to the constructor (this module
        defines UNKNOWN, SYNTAX, CONSTANT, NAME and PREPROCESSOR).
      name: the text of the token.
      start: the index of the first char of the token in the source.
      end: the end index of the token in the source.  GetTokens passes the
        index just past the last char, i.e. name == source[start:end].
      whence: where the token originated from; the constructor always sets
        it to WHENCE_STREAM.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        # The source positions are only included when utils.DEBUG is set.
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__

def _GetString(source, start, i):
    """Finds the end of a double-quoted string literal.

    Args:
      source: string of C++ source code.
      start: index where the token starts.  Not used by this function.
      i: index of the opening double quote.

    Returns:
      The index just past the closing double quote, or 0 when no closing
      quote exists (GetTokens treats an index <= 0 as invalid and stops).
    """
    i = source.find('"', i+1)
    # Stop as soon as find() fails.  Treating -1 as a real index made
    # source[i-1] inspect the tail of the source and could restart the search
    # from the beginning, looping forever on an unterminated string.
    while i != -1 and source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        # j >= 0 keeps the backwards scan from wrapping to the end of source.
        while j >= 0 and source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1

def _GetChar(source, start, i):
    """Finds the end of a single-quoted character literal.

    Args:
      source: string of C++ source code.
      start: index where the token starts.
      i: index of the opening single quote.

    Returns:
      The index just past the closing single quote.  When no closing quote
      exists, start + 1 is returned so the token covers a single char.
    """
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    # Check for -1 before indexing: otherwise source[i-1] looks at the tail
    # of the source and the search can restart from index 0 and never end.
    while i != -1 and source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1

def GetTokens(source):
    """Returns a sequence of Tokens.

    Comments and whitespace are skipped; everything else is yielded as a
    Token whose name is source[start:end].

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: an unrecognized character was found while errors are
        not being ignored (errors are ignored after a '#if 0' / '#if (0)').
      AssertionError: a double quote inside a pre-processor directive has
        no matching closing quote.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        # One-char lookahead; a slice yields '' at EOF instead of raising.
        next_c = source[i+1:i+2]
        if c.isalpha() or c == '_':  # Find a string token.
            token_type = NAME
            while i < end and source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            quote = source[i:i+1]
            if (quote == "'" and (i - start) == 1 and
                    source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif quote == '"' and source[start:i] in _STR_PREFIXES:
                # A prefixed string literal is introduced by a double quote.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and next_c == '/':  # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and next_c == '*':  # Find /* comments. */
            # Search past the opener so that '/*/' is not taken as a
            # complete comment.
            close = source.find('*/', i + 2)
            if close == -1:
                # Unterminated comment: it swallows the rest of the source.
                i = end
            else:
                i = close + 2
            continue
        elif c in ':+-<>&|*=':  # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i:i+1]
            if new_ch == c:
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':  # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i:i+1].isdigit():
                token_type = CONSTANT
                i += 1
                while i < end and source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():  # Find integer.
            token_type = CONSTANT
            # Compare against a tuple: '' in 'xX' would be true at EOF.
            if c == '0' and next_c in ('x', 'X'):
                # Handle hex digits.
                i += 2
                while i < end and source[i] in hex_digits:
                    i += 1
            else:
                while i < end and source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':  # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":  # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':  # Find pre-processor command.
            token_type = PREPROCESSOR
            # NOTE(review): only '#if' followed by whitespace increments
            # count_ifs, but every '#endif' decrements it, so '#ifdef' and
            # '#ifndef' blocks leave the counter unbalanced.
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                # i == end means the directive runs to EOF; there is no
                # char to inspect there.
                if i < end and source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                                condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':  # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyways, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            # Clamp the context window so a small i does not produce a
            # negative slice start (which would wrap around).
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(0, i-10):i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        if i == start:
            # No branch consumed anything (e.g. a letter or digit outside
            # the ASCII sets above).  Consume one char so the generator
            # always makes progress instead of yielding empty tokens forever.
            i += 1
        yield Token(token_type, source[start:i], start, i)

if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes.

        Prints the type and text of every token of each named file.

        Args:
          argv: command-line arguments; argv[1:] are the files to tokenize.
        """
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            # Files for which utils.ReadFile returns None are skipped.
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
            sys.stdout.write('\n')

    main(sys.argv)

« no previous file with comments | « testing/gmock/scripts/generator/cpp/keywords.py ('k') | testing/gmock/scripts/generator/cpp/utils.py » ('j') | no next file with comments »