#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string literal prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
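# As an illustration (not from the original code): these prefixes appear in
# literals such as u8"text" and L"wide", while the R variants mark raw
# strings, e.g. LR"(no \escapes here)".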


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from. This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source.
    end contains the index of the last char of the token in the source.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__

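# A quick illustration (not part of the original module): with utils.DEBUG
# unset, str(Token(NAME, 'foo', 0, 3)) is "Token('foo')"; with utils.DEBUG
# set, the offsets are included as well: "Token('foo', 0, 3)".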

def _GetString(source, start, i):
    """Returns the index one past the closing quote of a string literal."""
    # Note: start is unused here; it is kept for symmetry with _GetChar.
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1

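# A worked illustration of the backslash counting above (not from the
# original code): in the source text  "a\"b"  the first '"' found is
# preceded by one backslash (odd count), so it is escaped and the scan
# continues; in  "a\\"  the closing '"' is preceded by two backslashes
# (even count), which escape each other, so the literal really ends there.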

def _GetChar(source, start, i):
    """Returns the index one past the closing quote of a char literal."""
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c:
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. The raise
            # below is conditionally skipped so that bogus code inside an
            # #if 0 block can still be consumed. Since we will ignore that
            # code anyway, this is probably fine: just swallow the bogus
            # char and keep going.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)

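# A minimal usage sketch (assuming the package layout makes this module
# importable as cpp.tokenize):
#
#     from cpp import tokenize
#     for token in tokenize.GetTokens('int x = 0x1F; // count\n'):
#         print(token.token_type, token.name)
#
# This would print NAME/int, NAME/x, SYNTAX/=, CONSTANT/0x1F and SYNTAX/;.
# The trailing // comment is skipped rather than yielded.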

if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)
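# Example invocation (assuming this file lives at cpp/tokenize.py and is
# run from the directory that contains the cpp package):
#
#     python -m cpp.tokenize somefile.cc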