Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(8)

Side by Side Diff: tools/lexer_generator/regex_parser.py

Issue 157013003: Experimental parser: cleanup RegexParser (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « tools/lexer_generator/nfa_builder.py ('k') | tools/lexer_generator/transition_keys.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # Copyright 2013 the V8 project authors. All rights reserved. 1 # Copyright 2013 the V8 project authors. All rights reserved.
2 # Redistribution and use in source and binary forms, with or without 2 # Redistribution and use in source and binary forms, with or without
3 # modification, are permitted provided that the following conditions are 3 # modification, are permitted provided that the following conditions are
4 # met: 4 # met:
5 # 5 #
6 # * Redistributions of source code must retain the above copyright 6 # * Redistributions of source code must retain the above copyright
7 # notice, this list of conditions and the following disclaimer. 7 # notice, this list of conditions and the following disclaimer.
8 # * Redistributions in binary form must reproduce the above 8 # * Redistributions in binary form must reproduce the above
9 # copyright notice, this list of conditions and the following 9 # copyright notice, this list of conditions and the following
10 # disclaimer in the documentation and/or other materials provided 10 # disclaimer in the documentation and/or other materials provided
11 # with the distribution. 11 # with the distribution.
12 # * Neither the name of Google Inc. nor the names of its 12 # * Neither the name of Google Inc. nor the names of its
13 # contributors may be used to endorse or promote products derived 13 # contributors may be used to endorse or promote products derived
14 # from this software without specific prior written permission. 14 # from this software without specific prior written permission.
15 # 15 #
16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 import ply.lex as lex 28 import ply.lex as lex
29 import ply.yacc as yacc 29 import ply.yacc as yacc
30 from types import ListType, TupleType
31 from regex_lexer import RegexLexer
32 from action import Term 30 from action import Term
31 from nfa_builder import NfaBuilder
33 32
34 class ParserBuilder: 33 class ParserBuilder:
35 34
class Logger(object):
  """Logger handed to ply.

  Debug/info chatter is swallowed; warnings and errors are escalated to
  exceptions so grammar/lexer problems never pass silently during
  parser-table construction.
  """

  @staticmethod
  def _fail(level, msg, args):
    # Single formatting point for every escalated ply message.
    raise Exception(level + ": " + (msg % args) + "\n")

  def debug(self, msg, *args, **kwargs):
    pass

  def info(self, msg, *args, **kwargs):
    pass

  def warning(self, msg, *args, **kwargs):
    self._fail("warning", msg, args)

  def error(self, msg, *args, **kwargs):
    self._fail("error", msg, args)
49 47
50 __static_instances = {} 48 __static_instances = {}
51 @staticmethod 49 @staticmethod
52 def parse( 50 def parse(
53 string, name, new_lexer, new_parser, preparse = None, postparse = None): 51 string, name, new_lexer, new_parser, preparse = None, postparse = None):
54 if not name in ParserBuilder.__static_instances: 52 if not name in ParserBuilder.__static_instances:
55 logger = ParserBuilder.Logger() 53 logger = ParserBuilder.Logger()
56 lexer_instance = new_lexer() 54 lexer_instance = new_lexer()
57 lexer_instance.lex = lex.lex(module=lexer_instance) 55 lexer_instance.lex = lex.lex(module=lexer_instance)
58 instance = new_parser() 56 instance = new_parser()
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
98 'REPEAT_BEGIN', 96 'REPEAT_BEGIN',
99 'REPEAT_END', 97 'REPEAT_END',
100 98
101 'NUMBER', 99 'NUMBER',
102 'COMMA', 100 'COMMA',
103 'LITERAL', 101 'LITERAL',
104 102
105 'RANGE', 103 'RANGE',
106 'NOT', 104 'NOT',
107 'CLASS_LITERAL', 105 'CLASS_LITERAL',
108 'CLASS_LITERAL_AS_OCTAL',
109 'CHARACTER_CLASS', 106 'CHARACTER_CLASS',
110 ) 107 )
111 108
# ply lexer states: character classes ([...]) and repetition counts ({...})
# are tokenized in dedicated exclusive states.
states = (
    ('class', 'exclusive'),
    ('repeat', 'exclusive'),
)
116 113
117 __escaped_literals = build_escape_map("(){}[]?+.*|'\"\\") 114 __escaped_literals = build_escape_map("(){}[]?+.*|'\"\\")
118 115
(...skipping 24 matching lines...) Expand all
143 r'\]' 140 r'\]'
144 self.lex.pop_state() 141 self.lex.pop_state()
145 return t 142 return t
146 143
# Simple token patterns used inside a [...] class (ply treats these
# strings as regexes). Raw strings avoid the invalid escape sequence
# '\^', which triggers a DeprecationWarning/SyntaxWarning on modern
# Python; the pattern values are unchanged.
t_class_RANGE = '-'
t_class_NOT = r'\^'
t_class_CHARACTER_CLASS = r':\w+:'
150 147
def t_class_CLASS_LITERAL_AS_OCTAL(self, t):
  r'\\\d+'
  # Decode an octal escape such as \101 into its character and reclassify
  # the token as a plain CLASS_LITERAL, so the parser only ever sees one
  # literal token type inside a class.
  # NOTE(review): digits 8/9 would make int(..., 8) raise ValueError —
  # presumably such escapes never occur in the grammar files; confirm.
  octal_digits = t.value[1:]
  t.value = chr(int(octal_digits, 8))
  t.type = 'CLASS_LITERAL'
  return t
154 153
155 __escaped_class_literals = build_escape_map("^[]-:\\") 154 __escaped_class_literals = build_escape_map("^[]-:\\")
156 155
def t_class_ESCAPED_CLASS_LITERAL(self, t):
  r'\\.'
  # Map a backslash escape (e.g. \] or \-) to the character it denotes and
  # present it to the parser as an ordinary CLASS_LITERAL. The lookup table
  # is the class-private __escaped_class_literals map (name-mangled access
  # through the RegexLexer class), built from "^[]-:\\".
  # An unknown escape raises KeyError here rather than passing through.
  t.value = RegexLexer.__escaped_class_literals[t.value]
  t.type = 'CLASS_LITERAL'
  return t
162 161
(...skipping 18 matching lines...) Expand all
181 raise Exception("Illegal character '%s'" % t.value[0]) 180 raise Exception("Illegal character '%s'" % t.value[0])
182 181
class RegexParser:
  """ply grammar that parses a regular-expression string into a Term tree.

  Productions build Term nodes (LITERAL, CLASS, NOT_CLASS, RANGE, REPEAT,
  modifier terms, ...) and delegate concatenation/alternation to NfaBuilder
  so adjacent terms are combined consistently with the NFA construction.
  Use the static parse() entry point; the p_* methods are ply callbacks.
  """

  # ply requires the parser to expose the lexer's token names.
  tokens = RegexLexer.tokens

  # Maps single-character modifier/any tokens to Term tags.
  token_map = {
    '+': 'ONE_OR_MORE',
    '?': 'ZERO_OR_ONE',
    '*': 'ZERO_OR_MORE',
    '.': 'ANY',
  }

  def p_start(self, p):
    '''start : fragments OR fragments
             | fragments'''
    if len(p) == 2:
      p[0] = p[1]
    else:
      p[0] = NfaBuilder.or_terms([p[1], p[3]])

  def p_fragments(self, p):
    '''fragments : fragment
                 | fragment fragments'''
    if len(p) == 2:
      p[0] = p[1]
    else:
      p[0] = self.__cat(p[1], p[2])

  def p_fragment(self, p):
    '''fragment : literal maybe_modifier
                | class maybe_modifier
                | group maybe_modifier
                | any maybe_modifier
    '''
    # p[2] is None (no modifier), a 1-tuple like ('ONE_OR_MORE',), or a
    # 3-tuple ('REPEAT', min, max) -- see p_maybe_modifier.
    if p[2]:
      if p[2][0] == 'REPEAT':
        p[0] = Term(p[2][0], p[2][1], p[2][2], p[1])
      else:
        p[0] = Term(p[2][0], p[1])
    else:
      p[0] = p[1]

  def p_maybe_modifier(self, p):
    '''maybe_modifier : ONE_OR_MORE
                      | ZERO_OR_ONE
                      | ZERO_OR_MORE
                      | REPEAT_BEGIN NUMBER REPEAT_END
                      | REPEAT_BEGIN NUMBER COMMA NUMBER REPEAT_END
                      | empty'''
    # len(p) includes the left-hand side, so {n} matches with len(p) == 4
    # and {n,m} with len(p) == 6. (A check for len(p) == 5 would be dead
    # code and would send {n,m} into the token_map lookup below, raising
    # KeyError.)
    if len(p) == 4:
      p[0] = ("REPEAT", p[2], p[2])
    elif len(p) == 6:
      p[0] = ("REPEAT", p[2], p[4])
    elif p[1]:
      p[0] = (self.token_map[p[1]],)
    else:
      # 'empty' production: no modifier present.
      p[0] = None

  def p_literal(self, p):
    '''literal : LITERAL'''
    p[0] = Term('LITERAL', p[1])

  def p_any(self, p):
    '''any : ANY'''
    p[0] = Term(self.token_map[p[1]])

  def p_class(self, p):
    '''class : CLASS_BEGIN class_content CLASS_END
             | CLASS_BEGIN NOT class_content CLASS_END'''
    if len(p) == 4:
      p[0] = Term("CLASS", p[2])
    else:
      p[0] = Term("NOT_CLASS", p[3])

  def p_group(self, p):
    '''group : GROUP_BEGIN start GROUP_END'''
    # Parentheses only group; they introduce no node of their own.
    p[0] = p[2]

  def p_class_content(self, p):
    '''class_content : CLASS_LITERAL RANGE CLASS_LITERAL maybe_class_content
                     | CLASS_LITERAL maybe_class_content
                     | CHARACTER_CLASS maybe_class_content
    '''
    if len(p) == 5:
      left = Term("RANGE", p[1], p[3])
    else:
      if len(p[1]) == 1:
        left = Term('LITERAL', p[1])
      else:
        # CHARACTER_CLASS tokens look like ':name:'; strip the colons.
        left = Term('CHARACTER_CLASS', p[1][1:-1])
    # The trailing maybe_class_content is always the last symbol.
    p[0] = self.__cat(left, p[len(p)-1])

  def p_maybe_class_content(self, p):
    '''maybe_class_content : class_content
                           | empty'''
    p[0] = p[1]

  def p_empty(self, p):
    'empty :'

  def p_error(self, p):
    raise Exception("Syntax error in input '%s'" % str(p))

  @staticmethod
  def __cat(left, right):
    # Concatenate two terms; right is None at the end of a sequence.
    assert left
    return NfaBuilder.cat_terms([left] if not right else [left, right])

  @staticmethod
  def parse(string):
    """Parses |string| and returns the resulting Term tree.

    Parser/lexer instances are cached per name by ParserBuilder.
    """
    new_lexer = lambda: RegexLexer()
    new_parser = lambda: RegexParser()
    return ParserBuilder.parse(string, "RegexParser", new_lexer, new_parser)
OLDNEW
« no previous file with comments | « tools/lexer_generator/nfa_builder.py ('k') | tools/lexer_generator/transition_keys.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698