tools/lexer_generator/transition_keys.py - Issue 169523003: Experimental parser: split and rename some files

Unified Diff: tools/lexer_generator/transition_keys.py

Issue 169523003: Experimental parser: split and rename some files (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: tools/lexer_generator/transition_keys.py

diff --git a/tools/lexer_generator/transition_keys.py b/tools/lexer_generator/transition_keys.py

deleted file mode 100644

index 2fabd7bcb31de70c8e5823f68e8f7d1766215223..0000000000000000000000000000000000000000

--- a/tools/lexer_generator/transition_keys.py

+++ /dev/null

@@ -1,483 +0,0 @@

-# Redistribution and use in source and binary forms, with or without

-# modification, are permitted provided that the following conditions are

-# met:

-# * Redistributions of source code must retain the above copyright

-# notice, this list of conditions and the following disclaimer.

-# * Redistributions in binary form must reproduce the above

-# copyright notice, this list of conditions and the following

-# disclaimer in the documentation and/or other materials provided

-# with the distribution.

-# * Neither the name of Google Inc. nor the names of its

-# contributors may be used to endorse or promote products derived

-# from this software without specific prior written permission.

-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-from itertools import chain

-from encoding import KeyEncoding

-from action import Term

-class TransitionKey(object):

- '''Represents a transition from a state in DFA or NFA to another state.

- A transition key has a list of character ranges and a list of class ranges

- (e.g., "whitespace"), defining for which characters the transition

- happens. When we generate code based on the transition key, the character

- ranges generate simple checks and the class ranges generate more complicated

- conditions, e.g., function calls.'''

- __cached_keys = {

- 'no_encoding' : {},

- 'latin1' : {},

- 'utf8' : {},

- 'utf16' : {},

- }

- @staticmethod

- def __cached_key(encoding, name, components_getter):

- encoding_name = encoding.name() if encoding else 'no_encoding'

- cache = TransitionKey.__cached_keys[encoding_name]

- if not name in cache:

- cache[name] = TransitionKey(encoding, components_getter())

- return cache[name]

- @staticmethod

- def epsilon():

- '''Returns a TransitionKey for the epsilon (empty) transition.'''

- return TransitionKey.__cached_key(None, 'epsilon',

- lambda : Term("EPSILON_KEY"))

- @staticmethod

- def omega():

- '''Always matches.'''

- return TransitionKey.__cached_key(None, 'omega', lambda : Term("OMEGA_KEY"))

- @staticmethod

- def any(encoding):

- '''Returns a TransitionKey which matches any encoded character.'''

- return TransitionKey.__cached_key(encoding, 'any',

- lambda : encoding.all_components_iter())

- @staticmethod

- def single_char(encoding, char):

- '''Returns a TransitionKey for a single-character transition.'''

- return TransitionKey.range(encoding, char, char)

- @staticmethod

- def range(encoding, a, b):

- '''Returns a TransitionKey for a single-character transition.'''

- return TransitionKey(encoding, encoding.numeric_range_term(a, b))

- @staticmethod

- def unique(term): # TODO(dcarney): rename

- '''Returns a unique TransitionKey for the given name (and creates it if it

- doesn't exist yet). The TransitionKey won't have any real character range,

- but a newly-created "mock" character range which is separate from all other

- character ranges.'''

- return TransitionKey(None, Term("TERM_KEY", term))

- @staticmethod

- def __process_term(encoding, term, components, key_map):

- key = term.name()

- args = term.args()

- if key == 'RANGE':

- components.append(encoding.numeric_range_term(args[0], args[1]))

- elif key == 'LITERAL':

- components += map(lambda x : encoding.numeric_range_term(x, x), args)

- elif key == 'CAT':

- for x in args:

- TransitionKey.__process_term(encoding, x, components, key_map)

- elif key == 'CHARACTER_CLASS':

- class_name = args[0]

- if encoding.named_range(class_name):

- c = encoding.named_range(class_name)

- if class_name in key_map:

- assert key_map[class_name] == TransitionKey(encoding, c)

- components.append(c)

- elif encoding.predefined_range_iter(class_name):

- cs = list(encoding.predefined_range_iter(class_name))

- if class_name in key_map:

- assert key_map[class_name] == TransitionKey(encoding, cs)

- components += cs

- elif class_name in key_map:

- components += key_map[class_name].__flatten()

- else:

- raise Exception('unknown character class [%s]' % args[0])

- else:

- raise Exception('bad key [%s]' % key)

- @staticmethod

- def character_class(encoding, term, key_map):

- '''Processes 'term' (a representation of a character class in the rule

- file), and constructs a TransitionKey based on it. 'key_map' contains

- already constructed aliases for character classes (they can be used in the

- new character class). It is a map from strings (character class names) to

- TransitionKeys. For example, graph might represent the character class

- [a-z:digit:] where 'digit' is a previously constructed character class found

- in "key_map".'''

- components = []

- assert term.name() == 'CLASS' or term.name() == 'NOT_CLASS'

- invert = term.name() == 'NOT_CLASS'

- assert len(term.args()) == 1

- TransitionKey.__process_term(encoding, term.args()[0], components, key_map)

- key = TransitionKey.__key_from_components(encoding, invert, components)

- assert key != None, "not invertible %s " % str(term)

- return key

- def matches_char(self, char):

- 'this is just for simple lexer testing and is incomplete'

- char = ord(char)

- assert 0 <= char and char < 128

- for c in self.__flatten():

- if c.name() == 'NUMERIC_RANGE_KEY':

- r = c.args()

- if r[0] <= char and char <= r[1]:

- return True

- return False

- # (disjoint, subset, advance_left, advance_right)

- __is_superset_of_key_helper = (

- (True, True, False, False), # -5 : error

- (True, True, False, False), # -4 : error

- (False, True, False, True), # -3 :

- (False, False, True, False), # -2 :

- (False, False, True, False), # -1 :

- (False, True, True, True), # 0 :

- (True, False, False, True), # 1 :

- (True, False, False, True), # 2 :

- (True, True, False, False), # 3 : error

- (False, True, False, True), # 4 :

- (True, True, False, False), # 5 : error

- )

- def is_superset_of_key(self, key):

- '''Returns true if 'key' is a sub-key of this TransitionKey.

- must be called on a key that is either a subset or disjoint'''

- helper = TransitionKey.__is_superset_of_key_helper

- (left, right) = (self.__flatten(), key.__flatten())

- (disjoint, subset, advance_left, advance_right) = (False, False, True, True)

- (right_exhausted, left_exhausted) = (False, False)

- while advance_left or advance_right:

- if advance_right:

- try:

- r = right.next()

- except StopIteration:

- right_exhausted = True

- if advance_left:

- try:

- l = left.next()

- except StopIteration:

- left_exhausted = True

- if right_exhausted or left_exhausted:

- break

- c = TransitionKey.__component_compare(l, r)

- (d, s, advance_left, advance_right) = helper[c + 5]

- disjoint |= d

- subset |= s

- if not right_exhausted:

- disjoint = True

- if disjoint and subset:

- raise Exception('not a subset and not disjoint')

- return subset

- @staticmethod

- def compare(self, other):

- left = list(self.__flatten())

- right = list(other.__flatten())

- offset = 0

- while offset < len(left) and offset < len(right):

- c = TransitionKey.__component_compare(left[offset], right[offset])

- if c != 0:

- return c

- offset += 1

- return TransitionKey.__cmp(len(left), len(right))

- def __cmp__(self, other):

- return TransitionKey.compare(self, other)

- def __hash__(self):

- return hash(self.__term)

- def __ne__(self, other):

- return not self == other

- def __eq__(self, other):

- return isinstance(other, TransitionKey) and self.__term == other.__term

- def range_iter(self, encoding):

- for c in self.__flatten():

- if c.name() == 'NUMERIC_RANGE_KEY':

- yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1]))

- elif c.name() == 'NAMED_RANGE_KEY':

- yield ('CLASS', c.args()[0])

- elif c.name() == 'TERM_KEY':

- yield ('UNIQUE', c.args()[0])

- elif c.name() == 'OMEGA_KEY':

- yield ('OMEGA', ())

- else:

- assert False, 'unimplemented %s' % c

- @staticmethod

- def __component_str(encoding, component):

- if component.name() == 'TERM_KEY':

- return component.args()[0]

- elif component.name() == 'NAMED_RANGE_KEY':

- return component.args()[0]

- elif component.name() == 'EPSILON_KEY':

- return 'epsilon'

- elif component.name() == 'OMEGA_KEY':

- return 'omega'

- elif component.name() == 'NUMERIC_RANGE_KEY':

- r = component.args()

- to_str = lambda x: KeyEncoding.to_str(encoding, x)

- if r[0] == r[1]:

- return "'%s'" % to_str(r[0])

- return "['%s'-'%s']" % (to_str(r[0]), to_str(r[1]))

- raise Exception('unprintable %s' % component)

- def __flatten(self):

- return self.__flatten_components([self.__term])

- @staticmethod

- def __flatten_components(components):

- for component in components:

- if component.name() != 'COMPOSITE_KEY':

- yield component

- else:

- for arg in component.args():

- yield arg

- __component_name_order = {

- 'EPSILON_KEY' : 0,

- 'NUMERIC_RANGE_KEY' : 1,

- 'NAMED_RANGE_KEY' : 2,

- 'TERM_KEY' : 3,

- 'OMEGA_KEY' : 4

- }

- @staticmethod

- def __cmp(a, b):

- 'wraps standard cmp function to return correct results for components'

- c = cmp(a, b)

- return 0 if c == 0 else (-1 if c < 0 else 1)

- @staticmethod

- def __component_name_compare(a, b):

- return TransitionKey.__cmp(TransitionKey.__component_name_order[a],

- TransitionKey.__component_name_order[b])

- @staticmethod

- def __component_compare(a, b):

- '''component-wise compare function, returns the following values when

- comparing non identical numerical ranges:

- non-overlapping : -2 -- a0 <= a1 < b0 <= b1

- b subset of a : -3 -- a0 < b0 <= b1 <= a1

- a subset of b : -4 -- a0 == b0 and a1 < b1

- a overlaps to left : -5 -- a0 < b0 and b0 <= a1 < b1

- otherwise a value in [-1, 1] is returned'''

- if a.name() != b.name():

- return TransitionKey.__component_name_compare(a.name(), b.name())

- if a.name() != 'NUMERIC_RANGE_KEY':

- return TransitionKey.__cmp(str(a), str(b))

- (a, b) = (a.args(), b.args())

- c0 = TransitionKey.__cmp(a[0], b[0])

- if c0 == 0:

- return 4 * TransitionKey.__cmp(a[1], b[1]) # either == or a subset

- sign = -1

- if c0 > 0:

- (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0

- assert a[0] < b[0]

- if b[1] <= a[1]: # subset

- return 3 * sign

- if a[1] < b[0]: # non overlap

- return 2 * sign

- return 5 * sign # partial overlap

- @staticmethod

- def __is_composable(term):

- return term.name() != 'EPSILON_KEY' and term.name() != 'OMEGA_KEY'

- @staticmethod

- def __construct(encoding, components):

- if isinstance(components, Term):

- components = [components]

- is_unique = False

- acc = []

- last = Term.empty_term()

- for current in TransitionKey.__flatten_components(components):

- name = current.name()

- # verify arguments

- if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY':

- pass

- elif name == 'NUMERIC_RANGE_KEY':

- assert encoding.is_primary_range(current.args())

- if last.name() == 'NUMERIC_RANGE_KEY':

- assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged'

- elif name == 'NAMED_RANGE_KEY':

- assert encoding.named_range(current.args()[0])

- else:

- raise Exception('illegal component %s' % str(current))

- # verify ordering, composability

- if last:

- assert TransitionKey.__is_composable(current), 'cannot compose'

- if len(acc) == 1:

- assert TransitionKey.__is_composable(last), 'cannot compose'

- c = TransitionKey.__component_compare(last, current)

- assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current))

- acc.append(current)

- last = current

- assert acc, "must have components"

- return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc)

- def __init__(self, encoding, components):

- self.__term = TransitionKey.__construct(encoding, components)

- self.__cached_hash = None

- def to_string(self, encoding):

- return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x),

- self.__flatten()))

- def __str__(self):

- return self.to_string(None)

- @staticmethod

- def __disjoint_keys(encoding, range_map):

- '''Takes a set of possibly overlapping ranges, returns a list of ranges

- which don't overlap and which cover the same points as the original

- set. range_map is a map from lower bounds to a list of upper bounds.'''

- sort = lambda x : sorted(set(x))

- range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))

- upper_bound = encoding.upper_bound() + 1

- for i in range(len(range_map)):

- (left, left_values) = range_map[i]

- next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound

- to_push = []

- for v in left_values:

- assert left <= next

- if v >= next:

- if not to_push and left < next:

- yield (left, next - 1)

- to_push.append(v)

- else:

- yield (left, v)

- left = v + 1

- if to_push:

- current = range_map[i + 1]

- range_map[i + 1] = (current[0], sort(current[1] + to_push))

- @staticmethod

- def __merge_ranges(encoding, ranges):

- last = None

- for r in ranges:

- if last == None:

- last = r

- elif last[1] + 1 == r[0]:

- last = (last[0], r[1])

- else:

- yield last

- last = r

- if last != None:

- yield last

- @staticmethod

- def __flatten_keys(keys):

- return chain(*map(lambda x : x.__flatten(), keys))

- @staticmethod

- def __disjoint_components_from_keys(encoding, keys, merge_ranges = False):

- return TransitionKey.__disjoint_components(

- encoding, TransitionKey.__flatten_keys(keys), merge_ranges)

- @staticmethod

- def __disjoint_components(encoding, components, merge_ranges):

- range_map = {}

- other_keys = set([])

- for x in components:

- if x.name() != 'NUMERIC_RANGE_KEY':

- other_keys.add(x)

- continue

- (start, end) = x.args()

- if not start in range_map:

- range_map[start] = []

- range_map[start].append(end)

- ranges = TransitionKey.__disjoint_keys(encoding, range_map)

- if merge_ranges:

- ranges = TransitionKey.__merge_ranges(encoding, ranges)

- other_keys = sorted(other_keys, cmp=TransitionKey.__component_compare)

- range_terms = map(

- lambda x : encoding.numeric_range_term(x[0], x[1]), ranges)

- return chain(iter(range_terms), iter(other_keys))

- @staticmethod

- def __key_from_components(encoding, invert, components):

- components = TransitionKey.__disjoint_components(encoding, components, True)

- if invert:

- components = TransitionKey.__invert_components(encoding, components)

- return None if not components else TransitionKey(encoding, components)

- @staticmethod

- def disjoint_keys(encoding, keys):

- '''Takes a set of possibly overlapping TransitionKeys, returns a list of

- TransitionKeys which don't overlap and whose union is the same as the union

- of the original key_set. In addition, TransitionKeys are not merged, only

- split.

- For example, if key_set contains two TransitionKeys for ranges [1-10] and

- [5-15], disjoint_keys returns a set of three TransitionKeys: [1-4], [5-10],

- [11-16].'''

- return map(lambda x : TransitionKey(encoding, x),

- TransitionKey.__disjoint_components_from_keys(encoding, keys))

- @staticmethod

- def inverse_key(encoding, keys):

- '''Returns a TransitionKey which matches represents the inverse of the union

- of 'keys'. The TransitionKeys contain a set of character ranges and a set

- of classes. The character ranges are inverted in relation to the latin_1

- character range, and the character classes are inverted in relation to all

- character classes in __class_bounds.'''

- return TransitionKey.__key_from_components(

- encoding, True, TransitionKey.__flatten_keys(keys))

- @staticmethod

- def merged_key(encoding, keys):

- return TransitionKey(encoding,

- TransitionKey.__disjoint_components_from_keys(encoding, keys, True))

- @staticmethod

- def __invert_components(encoding, components):

- key = lambda x, y: encoding.numeric_range_term(x, y)

- last = None

- classes = set(encoding.named_range_value_iter())

- for c in components:

- if c in classes:

- classes.remove(c)

- continue

- assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible'

- r = c.args()

- assert encoding.is_primary_range(r)

- if last == None:

- if r[0] != encoding.lower_bound():

- yield key(encoding.lower_bound(), r[0] - 1)

- elif last[1] + 1 < r[0]:

- yield key(last[1] + 1, r[0] - 1)

- last = r

- upper_bound = encoding.primary_range()[1]

- if last == None:

- r = encoding.primary_range()

- yield key(r[0], r[1])

- elif last[1] < upper_bound:

- yield key(last[1] + 1, upper_bound)

- for c in sorted(classes, TransitionKey.__component_compare):

- yield c

« no previous file with comments | « tools/lexer_generator/transition_key.py ('k') | no next file » | no next file with comments »