third_party/cython/src/Cython/Plex/Regexps.py - Issue 385073004: Adding cython v0.20.2 in third-party.

Side by Side Diff: third_party/cython/src/Cython/Plex/Regexps.py

Issue 385073004: Adding cython v0.20.2 in third-party. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Reference cython dev list thread. Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 #=======================================================================

	2 #

	3 # Python Lexical Analyser

	4 #

	5 # Regular Expressions

	6 #

	7 #=======================================================================

	8

	9 import types

	10 from sys import maxint as maxint

	11

	12 import Errors

	13

	14 #

	15 # Constants

	16 #

	17

	18 BOL = 'bol'

	19 EOL = 'eol'

	20 EOF = 'eof'

	21

	22 nl_code = ord('\n')

	23

	24 #

	25 # Helper functions

	26 #

	27

	28 def chars_to_ranges(s):

	29 """

	30 Return a list of character codes consisting of pairs

	31 [code1a, code1b, code2a, code2b,...] which cover all

	32 the characters in \|s\|.

	33 """

	34 char_list = list(s)

	35 char_list.sort()

	36 i = 0

	37 n = len(char_list)

	38 result = []

	39 while i < n:

	40 code1 = ord(char_list[i])

	41 code2 = code1 + 1

	42 i = i + 1

	43 while i < n and code2 >= ord(char_list[i]):

	44 code2 = code2 + 1

	45 i = i + 1

	46 result.append(code1)

	47 result.append(code2)

	48 return result

	49

	50 def uppercase_range(code1, code2):

	51 """

	52 If the range of characters from code1 to code2-1 includes any

	53 lower case letters, return the corresponding upper case range.

	54 """

	55 code3 = max(code1, ord('a'))

	56 code4 = min(code2, ord('z') + 1)

	57 if code3 < code4:

	58 d = ord('A') - ord('a')

	59 return (code3 + d, code4 + d)

	60 else:

	61 return None

	62

	63 def lowercase_range(code1, code2):

	64 """

	65 If the range of characters from code1 to code2-1 includes any

	66 upper case letters, return the corresponding lower case range.

	67 """

	68 code3 = max(code1, ord('A'))

	69 code4 = min(code2, ord('Z') + 1)

	70 if code3 < code4:

	71 d = ord('a') - ord('A')

	72 return (code3 + d, code4 + d)

	73 else:

	74 return None

	75

	76 def CodeRanges(code_list):

	77 """

	78 Given a list of codes as returned by chars_to_ranges, return

	79 an RE which will match a character in any of the ranges.

	80 """

	81 re_list = []

	82 for i in xrange(0, len(code_list), 2):

	83 re_list.append(CodeRange(code_list[i], code_list[i + 1]))

	84 return Alt(*re_list)

	85

	86 def CodeRange(code1, code2):

	87 """

	88 CodeRange(code1, code2) is an RE which matches any character

	89 with a code \|c\| in the range \|code1\| <= \|c\| < \|code2\|.

	90 """

	91 if code1 <= nl_code < code2:

	92 return Alt(RawCodeRange(code1, nl_code),

	93 RawNewline,

	94 RawCodeRange(nl_code + 1, code2))

	95 else:

	96 return RawCodeRange(code1, code2)

	97

	98 #

	99 # Abstract classes

	100 #

	101

	102 class RE(object):

	103 """RE is the base class for regular expression constructors.

	104 The following operators are defined on REs:

	105

	106 re1 + re2 is an RE which matches \|re1\| followed by \|re2\|

	107 re1 \| re2 is an RE which matches either \|re1\| or \|re2\|

	108 """

	109

	110 nullable = 1 # True if this RE can match 0 input symbols

	111 match_nl = 1 # True if this RE can match a string ending with '\n'

	112 str = None # Set to a string to override the class's __str__ result

	113

	114 def build_machine(self, machine, initial_state, final_state,

	115 match_bol, nocase):

	116 """

	117 This method should add states to \|machine\| to implement this

	118 RE, starting at \|initial_state\| and ending at \|final_state\|.

	119 If \|match_bol\| is true, the RE must be able to match at the

	120 beginning of a line. If nocase is true, upper and lower case

	121 letters should be treated as equivalent.

	122 """

	123 raise NotImplementedError("%s.build_machine not implemented" %

	124 self.__class__.__name__)

	125

	126 def build_opt(self, m, initial_state, c):

	127 """

	128 Given a state \|s\| of machine \|m\|, return a new state

	129 reachable from \|s\| on character \|c\| or epsilon.

	130 """

	131 s = m.new_state()

	132 initial_state.link_to(s)

	133 initial_state.add_transition(c, s)

	134 return s

	135

	136 def __add__(self, other):

	137 return Seq(self, other)

	138

	139 def __or__(self, other):

	140 return Alt(self, other)

	141

	142 def __str__(self):

	143 if self.str:

	144 return self.str

	145 else:

	146 return self.calc_str()

	147

	148 def check_re(self, num, value):

	149 if not isinstance(value, RE):

	150 self.wrong_type(num, value, "Plex.RE instance")

	151

	152 def check_string(self, num, value):

	153 if type(value) != type(''):

	154 self.wrong_type(num, value, "string")

	155

	156 def check_char(self, num, value):

	157 self.check_string(num, value)

	158 if len(value) != 1:

	159 raise Errors.PlexValueError("Invalid value for argument %d of Plex.% s."

	160 "Expected a string of length 1, got: %s" % (

	161 num, self.__class__.__name__, repr(value)))

	162

	163 def wrong_type(self, num, value, expected):

	164 if type(value) == types.InstanceType:

	165 got = "%s.%s instance" % (

	166 value.__class__.__module__, value.__class__.__name__)

	167 else:

	168 got = type(value).__name__

	169 raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "

	170 "(expected %s, got %s" % (

	171 num, self.__class__.__name__, expect ed, got))

	172

	173 #

	174 # Primitive RE constructors

	175 # -------------------------

	176 #

	177 # These are the basic REs from which all others are built.

	178 #

	179

	180 ## class Char(RE):

	181 ## """

	182 ## Char(c) is an RE which matches the character \|c\|.

	183 ## """

	184

	185 ## nullable = 0

	186

	187 ## def __init__(self, char):

	188 ## self.char = char

	189 ## self.match_nl = char == '\n'

	190

	191 ## def build_machine(self, m, initial_state, final_state, match_bol, nocase) :

	192 ## c = self.char

	193 ## if match_bol and c != BOL:

	194 ## s1 = self.build_opt(m, initial_state, BOL)

	195 ## else:

	196 ## s1 = initial_state

	197 ## if c == '\n' or c == EOF:

	198 ## s1 = self.build_opt(m, s1, EOL)

	199 ## if len(c) == 1:

	200 ## code = ord(self.char)

	201 ## s1.add_transition((code, code+1), final_state)

	202 ## if nocase and is_letter_code(code):

	203 ## code2 = other_case_code(code)

	204 ## s1.add_transition((code2, code2+1), final_state)

	205 ## else:

	206 ## s1.add_transition(c, final_state)

	207

	208 ## def calc_str(self):

	209 ## return "Char(%s)" % repr(self.char)

	210

	211 def Char(c):

	212 """

	213 Char(c) is an RE which matches the character \|c\|.

	214 """

	215 if len(c) == 1:

	216 result = CodeRange(ord(c), ord(c) + 1)

	217 else:

	218 result = SpecialSymbol(c)

	219 result.str = "Char(%s)" % repr(c)

	220 return result

	221

	222 class RawCodeRange(RE):

	223 """

	224 RawCodeRange(code1, code2) is a low-level RE which matches any character

	225 with a code \|c\| in the range \|code1\| <= \|c\| < \|code2\|, where the range

	226 does not include newline. For internal use only.

	227 """

	228 nullable = 0

	229 match_nl = 0

	230 range = None # (code, code)

	231 uppercase_range = None # (code, code) or None

	232 lowercase_range = None # (code, code) or None

	233

	234 def __init__(self, code1, code2):

	235 self.range = (code1, code2)

	236 self.uppercase_range = uppercase_range(code1, code2)

	237 self.lowercase_range = lowercase_range(code1, code2)

	238

	239 def build_machine(self, m, initial_state, final_state, match_bol, nocase):

	240 if match_bol:

	241 initial_state = self.build_opt(m, initial_state, BOL)

	242 initial_state.add_transition(self.range, final_state)

	243 if nocase:

	244 if self.uppercase_range:

	245 initial_state.add_transition(self.uppercase_range, final_state)

	246 if self.lowercase_range:

	247 initial_state.add_transition(self.lowercase_range, final_state)

	248

	249 def calc_str(self):

	250 return "CodeRange(%d,%d)" % (self.code1, self.code2)

	251

	252 class _RawNewline(RE):

	253 """

	254 RawNewline is a low-level RE which matches a newline character.

	255 For internal use only.

	256 """

	257 nullable = 0

	258 match_nl = 1

	259

	260 def build_machine(self, m, initial_state, final_state, match_bol, nocase):

	261 if match_bol:

	262 initial_state = self.build_opt(m, initial_state, BOL)

	263 s = self.build_opt(m, initial_state, EOL)

	264 s.add_transition((nl_code, nl_code + 1), final_state)

	265

	266 RawNewline = _RawNewline()

	267

	268

	269 class SpecialSymbol(RE):

	270 """

	271 SpecialSymbol(sym) is an RE which matches the special input

	272 symbol \|sym\|, which is one of BOL, EOL or EOF.

	273 """

	274 nullable = 0

	275 match_nl = 0

	276 sym = None

	277

	278 def __init__(self, sym):

	279 self.sym = sym

	280

	281 def build_machine(self, m, initial_state, final_state, match_bol, nocase):

	282 # Sequences 'bol bol' and 'bol eof' are impossible, so only need

	283 # to allow for bol if sym is eol

	284 if match_bol and self.sym == EOL:

	285 initial_state = self.build_opt(m, initial_state, BOL)

	286 initial_state.add_transition(self.sym, final_state)

	287

	288

	289 class Seq(RE):

	290 """Seq(re1, re2, re3...) is an RE which matches \|re1\| followed by

	291 \|re2\| followed by \|re3\|..."""

	292

	293 def __init__(self, *re_list):

	294 nullable = 1

	295 for i in xrange(len(re_list)):

	296 re = re_list[i]

	297 self.check_re(i, re)

	298 nullable = nullable and re.nullable

	299 self.re_list = re_list

	300 self.nullable = nullable

	301 i = len(re_list)

	302 match_nl = 0

	303 while i:

	304 i = i - 1

	305 re = re_list[i]

	306 if re.match_nl:

	307 match_nl = 1

	308 break

	309 if not re.nullable:

	310 break

	311 self.match_nl = match_nl

	312

	313 def build_machine(self, m, initial_state, final_state, match_bol, nocase):

	314 re_list = self.re_list

	315 if len(re_list) == 0:

	316 initial_state.link_to(final_state)

	317 else:

	318 s1 = initial_state

	319 n = len(re_list)

	320 for i in xrange(n):

	321 if i < n - 1:

	322 s2 = m.new_state()

	323 else:

	324 s2 = final_state

	325 re = re_list[i]

	326 re.build_machine(m, s1, s2, match_bol, nocase)

	327 s1 = s2

	328 match_bol = re.match_nl or (match_bol and re.nullable)

	329

	330 def calc_str(self):

	331 return "Seq(%s)" % ','.join(map(str, self.re_list))

	332

	333

	334 class Alt(RE):

	335 """Alt(re1, re2, re3...) is an RE which matches either \|re1\| or

	336 \|re2\| or \|re3\|..."""

	337

	338 def __init__(self, *re_list):

	339 self.re_list = re_list

	340 nullable = 0

	341 match_nl = 0

	342 nullable_res = []

	343 non_nullable_res = []

	344 i = 1

	345 for re in re_list:

	346 self.check_re(i, re)

	347 if re.nullable:

	348 nullable_res.append(re)

	349 nullable = 1

	350 else:

	351 non_nullable_res.append(re)

	352 if re.match_nl:

	353 match_nl = 1

	354 i = i + 1

	355 self.nullable_res = nullable_res

	356 self.non_nullable_res = non_nullable_res

	357 self.nullable = nullable

	358 self.match_nl = match_nl

	359

	360 def build_machine(self, m, initial_state, final_state, match_bol, nocase):

	361 for re in self.nullable_res:

	362 re.build_machine(m, initial_state, final_state, match_bol, nocase)

	363 if self.non_nullable_res:

	364 if match_bol:

	365 initial_state = self.build_opt(m, initial_state, BOL)

	366 for re in self.non_nullable_res:

	367 re.build_machine(m, initial_state, final_state, 0, nocase)

	368

	369 def calc_str(self):

	370 return "Alt(%s)" % ','.join(map(str, self.re_list))

	371

	372

	373 class Rep1(RE):

	374 """Rep1(re) is an RE which matches one or more repetitions of \|re\|."""

	375

	376 def __init__(self, re):

	377 self.check_re(1, re)

	378 self.re = re

	379 self.nullable = re.nullable

	380 self.match_nl = re.match_nl

	381

	382 def build_machine(self, m, initial_state, final_state, match_bol, nocase):

	383 s1 = m.new_state()

	384 s2 = m.new_state()

	385 initial_state.link_to(s1)

	386 self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)

	387 s2.link_to(s1)

	388 s2.link_to(final_state)

	389

	390 def calc_str(self):

	391 return "Rep1(%s)" % self.re

	392

	393

	394 class SwitchCase(RE):

	395 """

	396 SwitchCase(re, nocase) is an RE which matches the same strings as RE,

	397 but treating upper and lower case letters according to \|nocase\|. If

	398 \|nocase\| is true, case is ignored, otherwise it is not.

	399 """

	400 re = None

	401 nocase = None

	402

	403 def __init__(self, re, nocase):

	404 self.re = re

	405 self.nocase = nocase

	406 self.nullable = re.nullable

	407 self.match_nl = re.match_nl

	408

	409 def build_machine(self, m, initial_state, final_state, match_bol, nocase):

	410 self.re.build_machine(m, initial_state, final_state, match_bol,

	411 self.nocase)

	412

	413 def calc_str(self):

	414 if self.nocase:

	415 name = "NoCase"

	416 else:

	417 name = "Case"

	418 return "%s(%s)" % (name, self.re)

	419

	420 #

	421 # Composite RE constructors

	422 # -------------------------

	423 #

	424 # These REs are defined in terms of the primitive REs.

	425 #

	426

	427 Empty = Seq()

	428 Empty.__doc__ = \

	429 """

	430 Empty is an RE which matches the empty string.

	431 """

	432 Empty.str = "Empty"

	433

	434 def Str1(s):

	435 """

	436 Str1(s) is an RE which matches the literal string \|s\|.

	437 """

	438 result = Seq(*tuple(map(Char, s)))

	439 result.str = "Str(%s)" % repr(s)

	440 return result

	441

	442 def Str(*strs):

	443 """

	444 Str(s) is an RE which matches the literal string \|s\|.

	445 Str(s1, s2, s3, ...) is an RE which matches any of \|s1\| or \|s2\| or \|s3\|...

	446 """

	447 if len(strs) == 1:

	448 return Str1(strs[0])

	449 else:

	450 result = Alt(*tuple(map(Str1, strs)))

	451 result.str = "Str(%s)" % ','.join(map(repr, strs))

	452 return result

	453

	454 def Any(s):

	455 """

	456 Any(s) is an RE which matches any character in the string \|s\|.

	457 """

	458 #result = apply(Alt, tuple(map(Char, s)))

	459 result = CodeRanges(chars_to_ranges(s))

	460 result.str = "Any(%s)" % repr(s)

	461 return result

	462

	463 def AnyBut(s):

	464 """

	465 AnyBut(s) is an RE which matches any character (including

	466 newline) which is not in the string \|s\|.

	467 """

	468 ranges = chars_to_ranges(s)

	469 ranges.insert(0, -maxint)

	470 ranges.append(maxint)

	471 result = CodeRanges(ranges)

	472 result.str = "AnyBut(%s)" % repr(s)

	473 return result

	474

	475 AnyChar = AnyBut("")

	476 AnyChar.__doc__ = \

	477 """

	478 AnyChar is an RE which matches any single character (including a newline).

	479 """

	480 AnyChar.str = "AnyChar"

	481

	482 def Range(s1, s2 = None):

	483 """

	484 Range(c1, c2) is an RE which matches any single character in the range

	485 \|c1\| to \|c2\| inclusive.

	486 Range(s) where \|s\| is a string of even length is an RE which matches

	487 any single character in the ranges \|s[0]\| to \|s[1]\|, \|s[2]\| to \|s[3]\|,...

	488 """

	489 if s2:

	490 result = CodeRange(ord(s1), ord(s2) + 1)

	491 result.str = "Range(%s,%s)" % (s1, s2)

	492 else:

	493 ranges = []

	494 for i in range(0, len(s1), 2):

	495 ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1))

	496 result = Alt(*ranges)

	497 result.str = "Range(%s)" % repr(s1)

	498 return result

	499

	500 def Opt(re):

	501 """

	502 Opt(re) is an RE which matches either \|re\| or the empty string.

	503 """

	504 result = Alt(re, Empty)

	505 result.str = "Opt(%s)" % re

	506 return result

	507

	508 def Rep(re):

	509 """

	510 Rep(re) is an RE which matches zero or more repetitions of \|re\|.

	511 """

	512 result = Opt(Rep1(re))

	513 result.str = "Rep(%s)" % re

	514 return result

	515

	516 def NoCase(re):

	517 """

	518 NoCase(re) is an RE which matches the same strings as RE, but treating

	519 upper and lower case letters as equivalent.

	520 """

	521 return SwitchCase(re, nocase = 1)

	522

	523 def Case(re):

	524 """

	525 Case(re) is an RE which matches the same strings as RE, but treating

	526 upper and lower case letters as distinct, i.e. it cancels the effect

	527 of any enclosing NoCase().

	528 """

	529 return SwitchCase(re, nocase = 0)

	530

	531 #

	532 # RE Constants

	533 #

	534

	535 Bol = Char(BOL)

	536 Bol.__doc__ = \

	537 """

	538 Bol is an RE which matches the beginning of a line.

	539 """

	540 Bol.str = "Bol"

	541

	542 Eol = Char(EOL)

	543 Eol.__doc__ = \

	544 """

	545 Eol is an RE which matches the end of a line.

	546 """

	547 Eol.str = "Eol"

	548

	549 Eof = Char(EOF)

	550 Eof.__doc__ = \

	551 """

	552 Eof is an RE which matches the end of the file.

	553 """

	554 Eof.str = "Eof"

	555

OLD	NEW

« no previous file with comments | « third_party/cython/src/Cython/Plex/Machines.py ('k') | third_party/cython/src/Cython/Plex/Scanners.pxd » ('j') | no next file with comments »