tools/lexer_generator/transition_keys.py - Issue 159753009: Experimental parser: break KeyEncoding off into its own file

Side by Side Diff: tools/lexer_generator/transition_keys.py

Issue 159753009: Experimental parser: break KeyEncoding off into its own file (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 # Copyright 2013 the V8 project authors. All rights reserved.	1 # Copyright 2013 the V8 project authors. All rights reserved.

2 # Redistribution and use in source and binary forms, with or without	2 # Redistribution and use in source and binary forms, with or without

3 # modification, are permitted provided that the following conditions are	3 # modification, are permitted provided that the following conditions are

4 # met:	4 # met:

5 #	5 #

6 # * Redistributions of source code must retain the above copyright	6 # * Redistributions of source code must retain the above copyright

7 # notice, this list of conditions and the following disclaimer.	7 # notice, this list of conditions and the following disclaimer.

8 # * Redistributions in binary form must reproduce the above	8 # * Redistributions in binary form must reproduce the above

9 # copyright notice, this list of conditions and the following	9 # copyright notice, this list of conditions and the following

10 # disclaimer in the documentation and/or other materials provided	10 # disclaimer in the documentation and/or other materials provided

11 # with the distribution.	11 # with the distribution.

12 # * Neither the name of Google Inc. nor the names of its	12 # * Neither the name of Google Inc. nor the names of its

13 # contributors may be used to endorse or promote products derived	13 # contributors may be used to endorse or promote products derived

14 # from this software without specific prior written permission.	14 # from this software without specific prior written permission.

15 #	15 #

16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS	16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT	17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR	18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT	19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,	20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27	27

28 from types import IntType

29 from itertools import chain	28 from itertools import chain

	29 from encoding import KeyEncoding

30 from action import Term	30 from action import Term

31 from string import printable

32

33 class KeyEncoding(object):

34

35 __encodings = {}

36

37 @staticmethod

38 def get(name):

39 if not KeyEncoding.__encodings:

40 Latin1Encoding()

41 Utf16Encoding()

42 Utf8Encoding()

43 return KeyEncoding.__encodings[name]

44

45 def __init__(self, name, primary_range, named_ranges, predefined_ranges):

46 assert not name in KeyEncoding.__encodings

47 assert primary_range[0] <= primary_range[1]

48 KeyEncoding.__encodings[name] = self

49 self.__name = name

50 self.__primary_range = primary_range

51 self.__lower_bound = primary_range[0]

52 self.__upper_bound = primary_range[1]

53 self.__primary_range_component = self.numeric_range_term(primary_range[0],

54 primary_range[1])

55 self.__named_ranges = {

56 k : Term('NAMED_RANGE_KEY', k) for k in named_ranges }

57 def f(v):

58 if len(v) == 2:

59 return self.numeric_range_term(v[0], v[1])

60 elif len(v) == 1:

61 assert v[0] in self.__named_ranges

62 return self.__named_ranges[v[0]]

63 else:

64 raise Exception()

65 self.__predefined_ranges = {

66 k : map(f, v) for k, v in predefined_ranges.iteritems() }

67

68 def name(self):

69 return self.__name

70

71 def lower_bound(self):

72 return self.__lower_bound

73

74 def upper_bound(self):

75 return self.__upper_bound

76

77 def primary_range(self):

78 return self.__primary_range

79

80 def named_range(self, name):

81 ranges = self.__named_ranges

82 return Term.empty_term() if not name in ranges else ranges[name]

83

84 def named_range_iter(self):

85 return self.__named_range.iteritems()

86

87 def named_range_key_iter(self):

88 return self.__named_ranges.iterkeys()

89

90 def named_range_value_iter(self):

91 return self.__named_ranges.itervalues()

92

93 def predefined_range_iter(self, name):

94 ranges = self.__predefined_ranges

95 return None if not name in ranges else iter(ranges[name])

96

97 def __primary_range_iter(self):

98 yield self.__primary_range_component

99

100 def all_components_iter(self):

101 return chain(self.__primary_range_iter(), self.__named_ranges.itervalues())

102

103 def is_primary_range(self, r):

104 assert len(r) == 2

105 return self.in_primary_range(r[0], r[1])

106

107 def in_primary_range(self, a, b):

108 return self.lower_bound() <= a and b <= self.upper_bound()

109

110 def numeric_range_term(self, a, b):

111 assert type(a) == IntType and type(b) == IntType

112 assert self.in_primary_range(a, b)

113 return Term('NUMERIC_RANGE_KEY', a, b)

114	31

115 class TransitionKey(object):	32 class TransitionKey(object):

116 '''Represents a transition from a state in DFA or NFA to another state.	33 '''Represents a transition from a state in DFA or NFA to another state.

117	34

118 A transition key has a list of character ranges and a list of class ranges	35 A transition key has a list of character ranges and a list of class ranges

119 (e.g., "whitespace"), defining for which characters the transition	36 (e.g., "whitespace"), defining for which characters the transition

120 happens. When we generate code based on the transition key, the character	37 happens. When we generate code based on the transition key, the character

121 ranges generate simple checks and the class ranges generate more complicated	38 ranges generate simple checks and the class ranges generate more complicated

122 conditions, e.g., function calls.'''	39 conditions, e.g., function calls.'''

123	40

(...skipping 168 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
292	209

293 def __hash__(self):	210 def __hash__(self):

294 return hash(self.__term)	211 return hash(self.__term)

295	212

296 def __ne__(self, other):	213 def __ne__(self, other):

297 return not self == other	214 return not self == other

298	215

299 def __eq__(self, other):	216 def __eq__(self, other):

300 return isinstance(other, TransitionKey) and self.__term == other.__term	217 return isinstance(other, TransitionKey) and self.__term == other.__term

301	218

302 @staticmethod

303 def __class_name(encoding, r):

304 for name, v in encoding.class_range_iter():

305 if r == v:

306 return name

307 assert False

308

309 @staticmethod

310 def __unique_name(r):

311 for name, v in TransitionKey.__cached_keys['no_encoding'].items():

312 if v.__ranges and r == v.__ranges[0]:

313 return name[2:]

314 assert False

315

316 def range_iter(self, encoding):	219 def range_iter(self, encoding):

317 for c in self.__flatten():	220 for c in self.__flatten():

318 if c.name() == 'NUMERIC_RANGE_KEY':	221 if c.name() == 'NUMERIC_RANGE_KEY':

319 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1]))	222 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1]))

320 elif c.name() == 'NAMED_RANGE_KEY':	223 elif c.name() == 'NAMED_RANGE_KEY':

321 yield ('CLASS', c.args()[0])	224 yield ('CLASS', c.args()[0])

322 elif c.name() == 'TERM_KEY':	225 elif c.name() == 'TERM_KEY':

323 yield ('UNIQUE', c.args()[0])	226 yield ('UNIQUE', c.args()[0])

324 else:	227 else:

325 assert False, 'unimplemented %s' % c	228 assert False, 'unimplemented %s' % c

326	229

327 __printable_cache = {

328 ord('\t') : '\\t',

329 ord('\n') : '\\n',

330 ord('\r') : '\\r',

331 }

332

333 @staticmethod	230 @staticmethod

334 def __component_str(encoding, component):	231 def __component_str(encoding, component):

335 if component.name() == 'TERM_KEY':	232 if component.name() == 'TERM_KEY':

336 return component.args()[0]	233 return component.args()[0]

337 elif component.name() == 'NAMED_RANGE_KEY':	234 elif component.name() == 'NAMED_RANGE_KEY':

338 return component.args()[0]	235 return component.args()[0]

339 elif component.name() == 'EPSILON_KEY':	236 elif component.name() == 'EPSILON_KEY':

340 return 'epsilon'	237 return 'epsilon'

341 elif component.name() == 'OMEGA_KEY':	238 elif component.name() == 'OMEGA_KEY':

342 return 'omega'	239 return 'omega'

343 elif component.name() != 'NUMERIC_RANGE_KEY':	240 elif component.name() == 'NUMERIC_RANGE_KEY':

344 raise Exception('unprintable %s' % component)	241 r = component.args()

345 r = component.args()	242 to_str = lambda x: KeyEncoding.to_str(encoding, x)

346 def to_str(x):	243 if r[0] == r[1]:

347 assert not encoding or encoding.in_primary_range(x, x)	244 return '%s' % to_str(r[0])

348 if x > 127:

349 return str(x)

350 if not x in TransitionKey.__printable_cache:

351 res = "'%s'" % chr(x) if chr(x) in printable else str(x)

352 TransitionKey.__printable_cache[x] = res

353 return TransitionKey.__printable_cache[x]

354 if r[0] == r[1]:

355 return '%s' % to_str(r[0])

356 else:

357 return '[%s-%s]' % (to_str(r[0]), to_str(r[1]))	245 return '[%s-%s]' % (to_str(r[0]), to_str(r[1]))

	246 raise Exception('unprintable %s' % component)

358	247

359 def __flatten(self):	248 def __flatten(self):

360 return self.__flatten_components([self.__term])	249 return self.__flatten_components([self.__term])

361	250

362 @staticmethod	251 @staticmethod

363 def __flatten_components(components):	252 def __flatten_components(components):

364 for component in components:	253 for component in components:

365 if component.name() != 'COMPOSITE_KEY':	254 if component.name() != 'COMPOSITE_KEY':

366 yield component	255 yield component

367 else:	256 else:

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
409 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0	298 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0

410 assert a[0] < b[0]	299 assert a[0] < b[0]

411 if b[1] <= a[1]: # subset	300 if b[1] <= a[1]: # subset

412 return 3 * sign	301 return 3 * sign

413 if a[1] < b[0]: # non overlap	302 if a[1] < b[0]: # non overlap

414 return 2 * sign	303 return 2 * sign

415 return 5 * sign # partial overlap	304 return 5 * sign # partial overlap

416	305

417 @staticmethod	306 @staticmethod

418 def __construct(encoding, components):	307 def __construct(encoding, components):

	308 if isinstance(components, Term):

	309 components = [components]

419 is_unique = False	310 is_unique = False

420 acc = []	311 acc = []

421 last = Term.empty_term()	312 last = Term.empty_term()

422 for current in TransitionKey.__flatten_components(components):	313 for current in TransitionKey.__flatten_components(components):

423 name = current.name()	314 name = current.name()

424 if last:	315 if last:

425 assert name != 'EPSILON_KEY' and name != 'OMEGA_KEY', 'cannot merge'	316 assert name != 'EPSILON_KEY' and name != 'OMEGA_KEY', 'cannot merge'

426 c = TransitionKey.__component_compare(last, current)	317 c = TransitionKey.__component_compare(last, current)

427 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current))	318 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current))

428 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY':	319 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY':

429 pass	320 pass

430 elif name == 'NUMERIC_RANGE_KEY':	321 elif name == 'NUMERIC_RANGE_KEY':

431 assert encoding.is_primary_range(current.args())	322 assert encoding.is_primary_range(current.args())

432 if last.name() == 'NUMERIC_RANGE_KEY':	323 if last.name() == 'NUMERIC_RANGE_KEY':

433 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged'	324 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged'

434 elif name == 'NAMED_RANGE_KEY':	325 elif name == 'NAMED_RANGE_KEY':

435 assert encoding.named_range(current.args()[0])	326 assert encoding.named_range(current.args()[0])

436 else:	327 else:

437 raise Exception('illegal component %s' % str(current))	328 raise Exception('illegal component %s' % str(current))

438 acc.append(current)	329 acc.append(current)

439 last = current	330 last = current

440 assert acc, "must have components"	331 assert acc, "must have components"

441 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc)	332 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc)

442	333

443 def __init__(self, encoding, components):	334 def __init__(self, encoding, components):

444 if isinstance(components, Term):

445 components = [components]

446 self.__term = TransitionKey.__construct(encoding, components)	335 self.__term = TransitionKey.__construct(encoding, components)

447 self.__cached_hash = None	336 self.__cached_hash = None

448	337

449 def to_string(self, encoding):	338 def to_string(self, encoding):

450 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x),	339 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x),

451 self.__flatten()))	340 self.__flatten()))

452	341

453 def __str__(self):	342 def __str__(self):

454 return self.to_string(None)	343 return self.to_string(None)

455	344

(...skipping 96 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
552 return TransitionKey.__key_from_components(	441 return TransitionKey.__key_from_components(

553 encoding, True, TransitionKey.__flatten_keys(keys))	442 encoding, True, TransitionKey.__flatten_keys(keys))

554	443

555 @staticmethod	444 @staticmethod

556 def merged_key(encoding, keys):	445 def merged_key(encoding, keys):

557 return TransitionKey(encoding,	446 return TransitionKey(encoding,

558 TransitionKey.__disjoint_components_from_keys(encoding, keys, True))	447 TransitionKey.__disjoint_components_from_keys(encoding, keys, True))

559	448

560 @staticmethod	449 @staticmethod

561 def __invert_components(encoding, components):	450 def __invert_components(encoding, components):

562 def key(x, y):	451 key = lambda x, y: encoding.numeric_range_term(x, y)

563 return encoding.numeric_range_term(x, y)

564 last = None	452 last = None

565 classes = set(encoding.named_range_value_iter())	453 classes = set(encoding.named_range_value_iter())

566 for c in components:	454 for c in components:

567 if c in classes:	455 if c in classes:

568 classes.remove(c)	456 classes.remove(c)

569 continue	457 continue

570 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible'	458 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible'

571 r = c.args()	459 r = c.args()

572 assert encoding.is_primary_range(r)	460 assert encoding.is_primary_range(r)

573 if last == None:	461 if last == None:

574 if r[0] != encoding.lower_bound():	462 if r[0] != encoding.lower_bound():

575 yield key(encoding.lower_bound(), r[0] - 1)	463 yield key(encoding.lower_bound(), r[0] - 1)

576 elif last[1] + 1 < r[0]:	464 elif last[1] + 1 < r[0]:

577 yield key(last[1] + 1, r[0] - 1)	465 yield key(last[1] + 1, r[0] - 1)

578 last = r	466 last = r

579 upper_bound = encoding.primary_range()[1]	467 upper_bound = encoding.primary_range()[1]

580 if last == None:	468 if last == None:

581 r = encoding.primary_range()	469 r = encoding.primary_range()

582 yield key(r[0], r[1])	470 yield key(r[0], r[1])

583 elif last[1] < upper_bound:	471 elif last[1] < upper_bound:

584 yield key(last[1] + 1, upper_bound)	472 yield key(last[1] + 1, upper_bound)

585 for c in sorted(classes, TransitionKey.__component_compare):	473 for c in sorted(classes, TransitionKey.__component_compare):

586 yield c	474 yield c

587

588 class Latin1Encoding(KeyEncoding):

589

590 def __init__(self):

591 super(Latin1Encoding, self).__init__(

592 'latin1',

593 (0, 255),

594 [],

595 {

596 'whitespace':

597 [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)],

598 'letter':

599 [(65, 90), (97, 122), (170, 170), (181, 181),

600 (186, 186), (192, 214), (216, 246), (248, 255)],

601 'line_terminator':

602 [(10, 10), (13, 13)],

603 'identifier_part_not_letter':

604 [(48, 57), (95, 95)]

605 })

606

607 class Utf16Encoding(KeyEncoding):

608

609 def __init__(self):

610 super(Utf16Encoding, self).__init__(

611 'utf16',

612 (0, 255),

613 ['non_primary_whitespace',

614 'non_primary_letter',

615 'non_primary_identifier_part_not_letter',

616 'non_primary_line_terminator',

617 'non_primary_everything_else'],

618 {

619 'whitespace':

620 [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160),

621 ('non_primary_whitespace',)],

622 'letter':

623 [(65, 90), (97, 122), (170, 170), (181, 181),

624 (186, 186), (192, 214), (216, 246), (248, 255),

625 ('non_primary_letter',)],

626 'line_terminator':

627 [(10, 10), (13, 13), ('non_primary_line_terminator',)],

628 'identifier_part_not_letter':

629 [(48, 57), (95, 95), ('non_primary_identifier_part_not_letter',)],

630 })

631

632 class Utf8Encoding(KeyEncoding):

633

634 def __init__(self):

635 super(Utf8Encoding, self).__init__(

636 'utf8',

637 (0, 127),

638 ['non_primary_whitespace',

639 'non_primary_letter',

640 'non_primary_identifier_part_not_letter',

641 'non_primary_line_terminator',

642 'non_primary_everything_else'],

643 {

644 'whitespace':

645 [(9, 9), (11, 12), (32, 32), ('non_primary_whitespace',)],

646 'letter':

647 [(65, 90), (97, 122), ('non_primary_letter',)],

648 'line_terminator':

649 [(10, 10), (13, 13), ('non_primary_line_terminator',)],

650 'identifier_part_not_letter':

651 [(48, 57), (95, 95), ('non_primary_identifier_part_not_letter',)],

652 })

OLD	NEW

« no previous file with comments | « tools/lexer_generator/encoding.py ('k') | no next file » | no next file with comments »