Chromium Code Reviews

#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses PLY to build a tokenizer which understands WebIDL
sehr
2013/04/08 16:41:20
s/is uses/uses/
noelallen1
2013/04/11 21:52:09
Done.
tokens. The type of the tokens that match WebIDL will be:
  1- The grammar name exactly: float, integer, string, identifier
  2- The grammar for single-character operators: '?', ':', '{', ...
sehr
2013/04/08 16:41:20
grammar
noelallen1
2013/04/11 21:52:09
Done.
  3- The uppercase version of the keyword's multicharacter string:
     ANY, ATTRIBUTE, BOOLEAN, ...

In addition, there are Pepper specific tokens for comments and inline blocks,
and multicharacter operators such as '>>', '<<', and '...'.

WebIDL, and the WebIDL regular expressions, can be found at:
   http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""

import optparse
import os.path
import re
import sys

#
# Try to load the ply module; if not found, assume it is in the third_party
# directory, relative to ppapi.
#
try:
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Operators
    'ELLIPSIS',
    'LSHIFT',
    'RSHIFT',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # Pepper Extras
    'INLINE',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are matched against this dictionary to determine
  # if the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'label' : 'LABEL',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'namespace' : 'NAMESPACE',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Add keywords
  for key in keywords:
    tokens.append(keywords[key])

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?<>'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_integer = r'-?(0[Xx][0-9A-Fa-f]+)|([0-9]+[uU]?)'
  t_float = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
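  # (For example, '0x7F' and '123u' match t_integer, while '1.5', '.5e-3'
  # and '2E+10' match t_float.)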

  # Special multi-character operators
  t_ELLIPSIS = r'\.\.\.'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending ('\n'); used to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
sehr
2013/04/08 16:41:20
what is the "?" there for?
noelallen1
2013/04/11 21:52:09
I will add a new test to the lexer to vet comments
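    # (The '?' in '(.|\n)*?' makes the match non-greedy, so a block comment
    # ends at the first '*/' rather than the last one in the file.)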
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.lexobj.lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the values recorded for the previous lines are unused. We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    # pylint: disable=W0612
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.lexobj.filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = '\t^'.expandtabs(pos)
sehr
2013/04/08 16:41:20
This is a "cute" way of generating pos * ' '. Is
noelallen1
2013/04/11 21:52:09
' ' * pos + '^' would also work.
So would using w
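    # ('\t^'.expandtabs(pos) turns the tab into pos spaces, placing the
    # caret under column pos; ' ' * pos + '^' is an equivalent spelling.)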
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
      self.FileLineMsg(line, msg),
      self.SourceLine(line, pos))

  def GetTokens(self):
    outlist = []
    while True:
      t = self.lexobj.token()
      if t is None:
        break
      outlist.append(t)
    return outlist

  def __init__(self, filename, data):
    self.index = [0]
    self.lex_errors = 0
    self.lines = data.split('\n')
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
    self.lexobj.filename = filename
    self.lexobj.input(data)


#
# FileToTokens
#
# From a source file generate a list of tokens.
#
def FileToTokens(filename):
  with open(filename, 'rb') as srcfile:
    lexer = IDLLexer(filename, srcfile.read())
    return lexer.GetTokens()


#
# TextToTokens
#
# From a text string generate a list of tokens.
#
def TextToTokens(text):
  lexer = IDLLexer(None, text)
  return lexer.GetTokens()

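
# A usage sketch (illustrative only; DemoTokenTypes is not part of this CL):
# tokenizing a small fragment shows the naming scheme from the module
# docstring. Keywords come back as their uppercase names ('interface' ->
# INTERFACE), single characters as themselves, and other matches as
# 'identifier', 'integer', 'float' or 'string'. A leading underscore is
# stripped, so '_interface' is an identifier whose value is 'interface'.
def DemoTokenTypes():
  for tok in TextToTokens('interface _interface { attribute long x123; };'):
    print "%s >>%s<<" % (tok.type, tok.value)
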

#
# TestSameText
#
# From a list of token values, generate a new source text by joining with a
# single newline. The new source is then tokenized and compared against the
sehr
2013/04/08 16:41:20
s/space/newline/ ?
noelallen1
2013/04/11 21:52:09
Done.
# old set.
#
def TestSameText(values):
  tokens2 = TextToTokens('\n'.join(values))
  values2 = [tok.value for tok in tokens2]

  count1 = len(values)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2:
      count1 = count2

  failed = 0
  for i in range(count1):
    if values[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values[i], values2[i])
      failed = failed + 1

  return failed


#
# TestExpect
#
# From a list of token pairs, verify that the type of the second token in
# each pair matches the value of the first, so that the input:
#   integer 123 float 1.1
# generates a passing test: 'integer' names the expected type of the
# token '123', and 'float' names the expected type of the token '1.1'.
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    expect_type = tokens[index].value
    actual_type = tokens[index + 1].type
    index += 2

    if expect_type != actual_type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (expect_type, actual_type, tokens[index - 1].value))
      errors += 1

  return errors

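
# A sketch of the expected-type input format (illustrative only; DemoExpect
# is not part of this CL, though the test_lexer/*.in files used by Main
# presumably follow the same pairing convention): each token is preceded
# by a token naming its expected type.
def DemoExpect():
  # Returns 0: 'identifier' and 'integer' correctly name the types of the
  # tokens 'foo' and '123' that follow them.
  return TestExpect(TextToTokens('identifier foo integer 123'))
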

def Main(args):
  parser = optparse.OptionParser()
  parser.add_option('--test', help='Run tests.', action='store_true')

  # If no arguments are provided, run tests.
  if len(args) == 0:
    args = ['--test', 'test_lexer/values.in', 'test_lexer/keywords.in']
sehr
2013/04/08 16:41:20
This is sort of ugly, baking in test input paths.
noelallen1
2013/04/11 21:52:09
This is fixed in a different CL
On 2013/04/08 16:4

  global options
  options, filenames = parser.parse_args(args)

  if not filenames:
    parser.error('No files specified.')

  for filename in filenames:
    try:
      tokens = FileToTokens(filename)
      values = [tok.value for tok in tokens]
      errors = 0

      if options.test:
        if TestSameText(values):
          sys.stderr.write('Failed text match on %s.\n' % filename)
          return -1
        if TestExpect(tokens):
          sys.stderr.write('Failed expected type match on %s.\n' % filename)
          return -1

    except lex.LexError as le:
      sys.stderr.write('%s\n' % str(le))

  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
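
# Example invocation (a sketch; the file name 'idl_lexer.py' is assumed
# here, and the test_lexer/*.in paths are the defaults baked into Main):
#   python idl_lexer.py --test test_lexer/values.in test_lexer/keywords.in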