tools/lexer_generator/transition_keys.py - Issue 169523003: Experimental parser: split and rename some files

Side by Side Diff: tools/lexer_generator/transition_keys.py

Issue 169523003: Experimental parser: split and rename some files (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 # Copyright 2013 the V8 project authors. All rights reserved.

2 # Redistribution and use in source and binary forms, with or without

3 # modification, are permitted provided that the following conditions are

4 # met:

5 #

6 # * Redistributions of source code must retain the above copyright

7 # notice, this list of conditions and the following disclaimer.

8 # * Redistributions in binary form must reproduce the above

9 # copyright notice, this list of conditions and the following

10 # disclaimer in the documentation and/or other materials provided

11 # with the distribution.

12 # * Neither the name of Google Inc. nor the names of its

13 # contributors may be used to endorse or promote products derived

14 # from this software without specific prior written permission.

15 #

16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27

28 from itertools import chain

29 from encoding import KeyEncoding

30 from action import Term

31

32 class TransitionKey(object):

33 '''Represents a transition from a state in DFA or NFA to another state.

34

35 A transition key has a list of character ranges and a list of class ranges

36 (e.g., "whitespace"), defining for which characters the transition

37 happens. When we generate code based on the transition key, the character

38 ranges generate simple checks and the class ranges generate more complicated

39 conditions, e.g., function calls.'''

40

41 __cached_keys = {

42 'no_encoding' : {},

43 'latin1' : {},

44 'utf8' : {},

45 'utf16' : {},

46 }

47

48 @staticmethod

49 def __cached_key(encoding, name, components_getter):

50 encoding_name = encoding.name() if encoding else 'no_encoding'

51 cache = TransitionKey.__cached_keys[encoding_name]

52 if not name in cache:

53 cache[name] = TransitionKey(encoding, components_getter())

54 return cache[name]

55

56 @staticmethod

57 def epsilon():

58 '''Returns a TransitionKey for the epsilon (empty) transition.'''

59 return TransitionKey.__cached_key(None, 'epsilon',

60 lambda : Term("EPSILON_KEY"))

61

62 @staticmethod

63 def omega():

64 '''Always matches.'''

65 return TransitionKey.__cached_key(None, 'omega', lambda : Term("OMEGA_KEY"))

66

67 @staticmethod

68 def any(encoding):

69 '''Returns a TransitionKey which matches any encoded character.'''

70 return TransitionKey.__cached_key(encoding, 'any',

71 lambda : encoding.all_components_iter())

72

73 @staticmethod

74 def single_char(encoding, char):

75 '''Returns a TransitionKey for a single-character transition.'''

76 return TransitionKey.range(encoding, char, char)

77

78 @staticmethod

79 def range(encoding, a, b):

80 '''Returns a TransitionKey for a single-character transition.'''

81 return TransitionKey(encoding, encoding.numeric_range_term(a, b))

82

83 @staticmethod

84 def unique(term): # TODO(dcarney): rename

85 '''Returns a unique TransitionKey for the given name (and creates it if it

86 doesn't exist yet). The TransitionKey won't have any real character range,

87 but a newly-created "mock" character range which is separate from all other

88 character ranges.'''

89 return TransitionKey(None, Term("TERM_KEY", term))

90

91 @staticmethod

92 def __process_term(encoding, term, components, key_map):

93 key = term.name()

94 args = term.args()

95 if key == 'RANGE':

96 components.append(encoding.numeric_range_term(args[0], args[1]))

97 elif key == 'LITERAL':

98 components += map(lambda x : encoding.numeric_range_term(x, x), args)

99 elif key == 'CAT':

100 for x in args:

101 TransitionKey.__process_term(encoding, x, components, key_map)

102 elif key == 'CHARACTER_CLASS':

103 class_name = args[0]

104 if encoding.named_range(class_name):

105 c = encoding.named_range(class_name)

106 if class_name in key_map:

107 assert key_map[class_name] == TransitionKey(encoding, c)

108 components.append(c)

109 elif encoding.predefined_range_iter(class_name):

110 cs = list(encoding.predefined_range_iter(class_name))

111 if class_name in key_map:

112 assert key_map[class_name] == TransitionKey(encoding, cs)

113 components += cs

114 elif class_name in key_map:

115 components += key_map[class_name].__flatten()

116 else:

117 raise Exception('unknown character class [%s]' % args[0])

118 else:

119 raise Exception('bad key [%s]' % key)

120

121 @staticmethod

122 def character_class(encoding, term, key_map):

123 '''Processes 'term' (a representation of a character class in the rule

124 file), and constructs a TransitionKey based on it. 'key_map' contains

125 already constructed aliases for character classes (they can be used in the

126 new character class). It is a map from strings (character class names) to

127 TransitionKeys. For example, graph might represent the character class

128 [a-z:digit:] where 'digit' is a previously constructed character class found

129 in "key_map".'''

130 components = []

131 assert term.name() == 'CLASS' or term.name() == 'NOT_CLASS'

132 invert = term.name() == 'NOT_CLASS'

133 assert len(term.args()) == 1

134 TransitionKey.__process_term(encoding, term.args()[0], components, key_map)

135 key = TransitionKey.__key_from_components(encoding, invert, components)

136 assert key != None, "not invertible %s " % str(term)

137 return key

138

139 def matches_char(self, char):

140 'this is just for simple lexer testing and is incomplete'

141 char = ord(char)

142 assert 0 <= char and char < 128

143 for c in self.__flatten():

144 if c.name() == 'NUMERIC_RANGE_KEY':

145 r = c.args()

146 if r[0] <= char and char <= r[1]:

147 return True

148 return False

149

150 # (disjoint, subset, advance_left, advance_right)

151 __is_superset_of_key_helper = (

152 (True, True, False, False), # -5 : error

153 (True, True, False, False), # -4 : error

154 (False, True, False, True), # -3 :

155 (False, False, True, False), # -2 :

156 (False, False, True, False), # -1 :

157 (False, True, True, True), # 0 :

158 (True, False, False, True), # 1 :

159 (True, False, False, True), # 2 :

160 (True, True, False, False), # 3 : error

161 (False, True, False, True), # 4 :

162 (True, True, False, False), # 5 : error

163 )

164

165 def is_superset_of_key(self, key):

166 '''Returns true if 'key' is a sub-key of this TransitionKey.

167 must be called on a key that is either a subset or disjoint'''

168 helper = TransitionKey.__is_superset_of_key_helper

169 (left, right) = (self.__flatten(), key.__flatten())

170 (disjoint, subset, advance_left, advance_right) = (False, False, True, True)

171 (right_exhausted, left_exhausted) = (False, False)

172 while advance_left or advance_right:

173 if advance_right:

174 try:

175 r = right.next()

176 except StopIteration:

177 right_exhausted = True

178 if advance_left:

179 try:

180 l = left.next()

181 except StopIteration:

182 left_exhausted = True

183 if right_exhausted or left_exhausted:

184 break

185 c = TransitionKey.__component_compare(l, r)

186 (d, s, advance_left, advance_right) = helper[c + 5]

187 disjoint \|= d

188 subset \|= s

189 if not right_exhausted:

190 disjoint = True

191 if disjoint and subset:

192 raise Exception('not a subset and not disjoint')

193 return subset

194

195 @staticmethod

196 def compare(self, other):

197 left = list(self.__flatten())

198 right = list(other.__flatten())

199 offset = 0

200 while offset < len(left) and offset < len(right):

201 c = TransitionKey.__component_compare(left[offset], right[offset])

202 if c != 0:

203 return c

204 offset += 1

205 return TransitionKey.__cmp(len(left), len(right))

206

207 def __cmp__(self, other):

208 return TransitionKey.compare(self, other)

209

210 def __hash__(self):

211 return hash(self.__term)

212

213 def __ne__(self, other):

214 return not self == other

215

216 def __eq__(self, other):

217 return isinstance(other, TransitionKey) and self.__term == other.__term

218

219 def range_iter(self, encoding):

220 for c in self.__flatten():

221 if c.name() == 'NUMERIC_RANGE_KEY':

222 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1]))

223 elif c.name() == 'NAMED_RANGE_KEY':

224 yield ('CLASS', c.args()[0])

225 elif c.name() == 'TERM_KEY':

226 yield ('UNIQUE', c.args()[0])

227 elif c.name() == 'OMEGA_KEY':

228 yield ('OMEGA', ())

229 else:

230 assert False, 'unimplemented %s' % c

231

232 @staticmethod

233 def __component_str(encoding, component):

234 if component.name() == 'TERM_KEY':

235 return component.args()[0]

236 elif component.name() == 'NAMED_RANGE_KEY':

237 return component.args()[0]

238 elif component.name() == 'EPSILON_KEY':

239 return 'epsilon'

240 elif component.name() == 'OMEGA_KEY':

241 return 'omega'

242 elif component.name() == 'NUMERIC_RANGE_KEY':

243 r = component.args()

244 to_str = lambda x: KeyEncoding.to_str(encoding, x)

245 if r[0] == r[1]:

246 return "'%s'" % to_str(r[0])

247 return "['%s'-'%s']" % (to_str(r[0]), to_str(r[1]))

248 raise Exception('unprintable %s' % component)

249

250 def __flatten(self):

251 return self.__flatten_components([self.__term])

252

253 @staticmethod

254 def __flatten_components(components):

255 for component in components:

256 if component.name() != 'COMPOSITE_KEY':

257 yield component

258 else:

259 for arg in component.args():

260 yield arg

261

262 __component_name_order = {

263 'EPSILON_KEY' : 0,

264 'NUMERIC_RANGE_KEY' : 1,

265 'NAMED_RANGE_KEY' : 2,

266 'TERM_KEY' : 3,

267 'OMEGA_KEY' : 4

268 }

269

270 @staticmethod

271 def __cmp(a, b):

272 'wraps standard cmp function to return correct results for components'

273 c = cmp(a, b)

274 return 0 if c == 0 else (-1 if c < 0 else 1)

275

276 @staticmethod

277 def __component_name_compare(a, b):

278 return TransitionKey.__cmp(TransitionKey.__component_name_order[a],

279 TransitionKey.__component_name_order[b])

280

281 @staticmethod

282 def __component_compare(a, b):

283 '''component-wise compare function, returns the following values when

284 comparing non identical numerical ranges:

285 non-overlapping : -2 -- a0 <= a1 < b0 <= b1

286 b subset of a : -3 -- a0 < b0 <= b1 <= a1

287 a subset of b : -4 -- a0 == b0 and a1 < b1

288 a overlaps to left : -5 -- a0 < b0 and b0 <= a1 < b1

289 otherwise a value in [-1, 1] is returned'''

290 if a.name() != b.name():

291 return TransitionKey.__component_name_compare(a.name(), b.name())

292 if a.name() != 'NUMERIC_RANGE_KEY':

293 return TransitionKey.__cmp(str(a), str(b))

294 (a, b) = (a.args(), b.args())

295 c0 = TransitionKey.__cmp(a[0], b[0])

296 if c0 == 0:

297 return 4 * TransitionKey.__cmp(a[1], b[1]) # either == or a subset

298 sign = -1

299 if c0 > 0:

300 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0

301 assert a[0] < b[0]

302 if b[1] <= a[1]: # subset

303 return 3 * sign

304 if a[1] < b[0]: # non overlap

305 return 2 * sign

306 return 5 * sign # partial overlap

307

308 @staticmethod

309 def __is_composable(term):

310 return term.name() != 'EPSILON_KEY' and term.name() != 'OMEGA_KEY'

311

312 @staticmethod

313 def __construct(encoding, components):

314 if isinstance(components, Term):

315 components = [components]

316 is_unique = False

317 acc = []

318 last = Term.empty_term()

319 for current in TransitionKey.__flatten_components(components):

320 name = current.name()

321 # verify arguments

322 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY':

323 pass

324 elif name == 'NUMERIC_RANGE_KEY':

325 assert encoding.is_primary_range(current.args())

326 if last.name() == 'NUMERIC_RANGE_KEY':

327 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged'

328 elif name == 'NAMED_RANGE_KEY':

329 assert encoding.named_range(current.args()[0])

330 else:

331 raise Exception('illegal component %s' % str(current))

332 # verify ordering, composability

333 if last:

334 assert TransitionKey.__is_composable(current), 'cannot compose'

335 if len(acc) == 1:

336 assert TransitionKey.__is_composable(last), 'cannot compose'

337 c = TransitionKey.__component_compare(last, current)

338 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current))

339 acc.append(current)

340 last = current

341 assert acc, "must have components"

342 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc)

343

344 def __init__(self, encoding, components):

345 self.__term = TransitionKey.__construct(encoding, components)

346 self.__cached_hash = None

347

348 def to_string(self, encoding):

349 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x),

350 self.__flatten()))

351

352 def __str__(self):

353 return self.to_string(None)

354

355 @staticmethod

356 def __disjoint_keys(encoding, range_map):

357 '''Takes a set of possibly overlapping ranges, returns a list of ranges

358 which don't overlap and which cover the same points as the original

359 set. range_map is a map from lower bounds to a list of upper bounds.'''

360 sort = lambda x : sorted(set(x))

361 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))

362 upper_bound = encoding.upper_bound() + 1

363 for i in range(len(range_map)):

364 (left, left_values) = range_map[i]

365 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound

366 to_push = []

367 for v in left_values:

368 assert left <= next

369 if v >= next:

370 if not to_push and left < next:

371 yield (left, next - 1)

372 to_push.append(v)

373 else:

374 yield (left, v)

375 left = v + 1

376 if to_push:

377 current = range_map[i + 1]

378 range_map[i + 1] = (current[0], sort(current[1] + to_push))

379

380 @staticmethod

381 def __merge_ranges(encoding, ranges):

382 last = None

383 for r in ranges:

384 if last == None:

385 last = r

386 elif last[1] + 1 == r[0]:

387 last = (last[0], r[1])

388 else:

389 yield last

390 last = r

391 if last != None:

392 yield last

393

394 @staticmethod

395 def __flatten_keys(keys):

396 return chain(*map(lambda x : x.__flatten(), keys))

397

398 @staticmethod

399 def __disjoint_components_from_keys(encoding, keys, merge_ranges = False):

400 return TransitionKey.__disjoint_components(

401 encoding, TransitionKey.__flatten_keys(keys), merge_ranges)

402

403 @staticmethod

404 def __disjoint_components(encoding, components, merge_ranges):

405 range_map = {}

406 other_keys = set([])

407 for x in components:

408 if x.name() != 'NUMERIC_RANGE_KEY':

409 other_keys.add(x)

410 continue

411 (start, end) = x.args()

412 if not start in range_map:

413 range_map[start] = []

414 range_map[start].append(end)

415 ranges = TransitionKey.__disjoint_keys(encoding, range_map)

416 if merge_ranges:

417 ranges = TransitionKey.__merge_ranges(encoding, ranges)

418 other_keys = sorted(other_keys, cmp=TransitionKey.__component_compare)

419 range_terms = map(

420 lambda x : encoding.numeric_range_term(x[0], x[1]), ranges)

421 return chain(iter(range_terms), iter(other_keys))

422

423 @staticmethod

424 def __key_from_components(encoding, invert, components):

425 components = TransitionKey.__disjoint_components(encoding, components, True)

426 if invert:

427 components = TransitionKey.__invert_components(encoding, components)

428 return None if not components else TransitionKey(encoding, components)

429

430 @staticmethod

431 def disjoint_keys(encoding, keys):

432 '''Takes a set of possibly overlapping TransitionKeys, returns a list of

433 TransitionKeys which don't overlap and whose union is the same as the union

434 of the original key_set. In addition, TransitionKeys are not merged, only

435 split.

436

437 For example, if key_set contains two TransitionKeys for ranges [1-10] and

438 [5-15], disjoint_keys returns a set of three TransitionKeys: [1-4], [5-10],

439 [11-16].'''

440 return map(lambda x : TransitionKey(encoding, x),

441 TransitionKey.__disjoint_components_from_keys(encoding, keys))

442

443 @staticmethod

444 def inverse_key(encoding, keys):

445 '''Returns a TransitionKey which matches represents the inverse of the union

446 of 'keys'. The TransitionKeys contain a set of character ranges and a set

447 of classes. The character ranges are inverted in relation to the latin_1

448 character range, and the character classes are inverted in relation to all

449 character classes in __class_bounds.'''

450 return TransitionKey.__key_from_components(

451 encoding, True, TransitionKey.__flatten_keys(keys))

452

453 @staticmethod

454 def merged_key(encoding, keys):

455 return TransitionKey(encoding,

456 TransitionKey.__disjoint_components_from_keys(encoding, keys, True))

457

458 @staticmethod

459 def __invert_components(encoding, components):

460 key = lambda x, y: encoding.numeric_range_term(x, y)

461 last = None

462 classes = set(encoding.named_range_value_iter())

463 for c in components:

464 if c in classes:

465 classes.remove(c)

466 continue

467 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible'

468 r = c.args()

469 assert encoding.is_primary_range(r)

470 if last == None:

471 if r[0] != encoding.lower_bound():

472 yield key(encoding.lower_bound(), r[0] - 1)

473 elif last[1] + 1 < r[0]:

474 yield key(last[1] + 1, r[0] - 1)

475 last = r

476 upper_bound = encoding.primary_range()[1]

477 if last == None:

478 r = encoding.primary_range()

479 yield key(r[0], r[1])

480 elif last[1] < upper_bound:

481 yield key(last[1] + 1, upper_bound)

482 for c in sorted(classes, TransitionKey.__component_compare):

483 yield c

OLD	NEW

« no previous file with comments | « tools/lexer_generator/transition_key.py ('k') | no next file » | no next file with comments »