OLD | NEW |
(Empty) | |
| 1 #======================================================================= |
| 2 # |
| 3 # Python Lexical Analyser |
| 4 # |
| 5 # Regular Expressions |
| 6 # |
| 7 #======================================================================= |
| 8 |
| 9 import types |
| 10 from sys import maxint as maxint |
| 11 |
| 12 import Errors |
| 13 |
| 14 # |
| 15 # Constants |
| 16 # |
| 17 |
| 18 BOL = 'bol' |
| 19 EOL = 'eol' |
| 20 EOF = 'eof' |
| 21 |
| 22 nl_code = ord('\n') |
| 23 |
| 24 # |
| 25 # Helper functions |
| 26 # |
| 27 |
| 28 def chars_to_ranges(s): |
| 29 """ |
| 30 Return a list of character codes consisting of pairs |
| 31 [code1a, code1b, code2a, code2b,...] which cover all |
| 32 the characters in |s|. |
| 33 """ |
| 34 char_list = list(s) |
| 35 char_list.sort() |
| 36 i = 0 |
| 37 n = len(char_list) |
| 38 result = [] |
| 39 while i < n: |
| 40 code1 = ord(char_list[i]) |
| 41 code2 = code1 + 1 |
| 42 i = i + 1 |
| 43 while i < n and code2 >= ord(char_list[i]): |
| 44 code2 = code2 + 1 |
| 45 i = i + 1 |
| 46 result.append(code1) |
| 47 result.append(code2) |
| 48 return result |
| 49 |
| 50 def uppercase_range(code1, code2): |
| 51 """ |
| 52 If the range of characters from code1 to code2-1 includes any |
| 53 lower case letters, return the corresponding upper case range. |
| 54 """ |
| 55 code3 = max(code1, ord('a')) |
| 56 code4 = min(code2, ord('z') + 1) |
| 57 if code3 < code4: |
| 58 d = ord('A') - ord('a') |
| 59 return (code3 + d, code4 + d) |
| 60 else: |
| 61 return None |
| 62 |
| 63 def lowercase_range(code1, code2): |
| 64 """ |
| 65 If the range of characters from code1 to code2-1 includes any |
| 66 upper case letters, return the corresponding lower case range. |
| 67 """ |
| 68 code3 = max(code1, ord('A')) |
| 69 code4 = min(code2, ord('Z') + 1) |
| 70 if code3 < code4: |
| 71 d = ord('a') - ord('A') |
| 72 return (code3 + d, code4 + d) |
| 73 else: |
| 74 return None |
| 75 |
| 76 def CodeRanges(code_list): |
| 77 """ |
| 78 Given a list of codes as returned by chars_to_ranges, return |
| 79 an RE which will match a character in any of the ranges. |
| 80 """ |
| 81 re_list = [] |
| 82 for i in xrange(0, len(code_list), 2): |
| 83 re_list.append(CodeRange(code_list[i], code_list[i + 1])) |
| 84 return Alt(*re_list) |
| 85 |
| 86 def CodeRange(code1, code2): |
| 87 """ |
| 88 CodeRange(code1, code2) is an RE which matches any character |
| 89 with a code |c| in the range |code1| <= |c| < |code2|. |
| 90 """ |
| 91 if code1 <= nl_code < code2: |
| 92 return Alt(RawCodeRange(code1, nl_code), |
| 93 RawNewline, |
| 94 RawCodeRange(nl_code + 1, code2)) |
| 95 else: |
| 96 return RawCodeRange(code1, code2) |
| 97 |
| 98 # |
| 99 # Abstract classes |
| 100 # |
| 101 |
| 102 class RE(object): |
| 103 """RE is the base class for regular expression constructors. |
| 104 The following operators are defined on REs: |
| 105 |
| 106 re1 + re2 is an RE which matches |re1| followed by |re2| |
| 107 re1 | re2 is an RE which matches either |re1| or |re2| |
| 108 """ |
| 109 |
| 110 nullable = 1 # True if this RE can match 0 input symbols |
| 111 match_nl = 1 # True if this RE can match a string ending with '\n' |
| 112 str = None # Set to a string to override the class's __str__ result |
| 113 |
| 114 def build_machine(self, machine, initial_state, final_state, |
| 115 match_bol, nocase): |
| 116 """ |
| 117 This method should add states to |machine| to implement this |
| 118 RE, starting at |initial_state| and ending at |final_state|. |
| 119 If |match_bol| is true, the RE must be able to match at the |
| 120 beginning of a line. If nocase is true, upper and lower case |
| 121 letters should be treated as equivalent. |
| 122 """ |
| 123 raise NotImplementedError("%s.build_machine not implemented" % |
| 124 self.__class__.__name__) |
| 125 |
| 126 def build_opt(self, m, initial_state, c): |
| 127 """ |
| 128 Given a state |s| of machine |m|, return a new state |
| 129 reachable from |s| on character |c| or epsilon. |
| 130 """ |
| 131 s = m.new_state() |
| 132 initial_state.link_to(s) |
| 133 initial_state.add_transition(c, s) |
| 134 return s |
| 135 |
| 136 def __add__(self, other): |
| 137 return Seq(self, other) |
| 138 |
| 139 def __or__(self, other): |
| 140 return Alt(self, other) |
| 141 |
| 142 def __str__(self): |
| 143 if self.str: |
| 144 return self.str |
| 145 else: |
| 146 return self.calc_str() |
| 147 |
| 148 def check_re(self, num, value): |
| 149 if not isinstance(value, RE): |
| 150 self.wrong_type(num, value, "Plex.RE instance") |
| 151 |
| 152 def check_string(self, num, value): |
| 153 if type(value) != type(''): |
| 154 self.wrong_type(num, value, "string") |
| 155 |
| 156 def check_char(self, num, value): |
| 157 self.check_string(num, value) |
| 158 if len(value) != 1: |
| 159 raise Errors.PlexValueError("Invalid value for argument %d of Plex.%
s." |
| 160 "Expected a string of length 1, got: %s" % ( |
| 161 num, self.__class__.__name__, repr(value))) |
| 162 |
| 163 def wrong_type(self, num, value, expected): |
| 164 if type(value) == types.InstanceType: |
| 165 got = "%s.%s instance" % ( |
| 166 value.__class__.__module__, value.__class__.__name__) |
| 167 else: |
| 168 got = type(value).__name__ |
| 169 raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s " |
| 170 "(expected %s, got %s" % ( |
| 171 num, self.__class__.__name__, expect
ed, got)) |
| 172 |
| 173 # |
| 174 # Primitive RE constructors |
| 175 # ------------------------- |
| 176 # |
| 177 # These are the basic REs from which all others are built. |
| 178 # |
| 179 |
| 180 ## class Char(RE): |
| 181 ## """ |
| 182 ## Char(c) is an RE which matches the character |c|. |
| 183 ## """ |
| 184 |
| 185 ## nullable = 0 |
| 186 |
| 187 ## def __init__(self, char): |
| 188 ## self.char = char |
| 189 ## self.match_nl = char == '\n' |
| 190 |
| 191 ## def build_machine(self, m, initial_state, final_state, match_bol, nocase)
: |
| 192 ## c = self.char |
| 193 ## if match_bol and c != BOL: |
| 194 ## s1 = self.build_opt(m, initial_state, BOL) |
| 195 ## else: |
| 196 ## s1 = initial_state |
| 197 ## if c == '\n' or c == EOF: |
| 198 ## s1 = self.build_opt(m, s1, EOL) |
| 199 ## if len(c) == 1: |
| 200 ## code = ord(self.char) |
| 201 ## s1.add_transition((code, code+1), final_state) |
| 202 ## if nocase and is_letter_code(code): |
| 203 ## code2 = other_case_code(code) |
| 204 ## s1.add_transition((code2, code2+1), final_state) |
| 205 ## else: |
| 206 ## s1.add_transition(c, final_state) |
| 207 |
| 208 ## def calc_str(self): |
| 209 ## return "Char(%s)" % repr(self.char) |
| 210 |
| 211 def Char(c): |
| 212 """ |
| 213 Char(c) is an RE which matches the character |c|. |
| 214 """ |
| 215 if len(c) == 1: |
| 216 result = CodeRange(ord(c), ord(c) + 1) |
| 217 else: |
| 218 result = SpecialSymbol(c) |
| 219 result.str = "Char(%s)" % repr(c) |
| 220 return result |
| 221 |
| 222 class RawCodeRange(RE): |
| 223 """ |
| 224 RawCodeRange(code1, code2) is a low-level RE which matches any character |
| 225 with a code |c| in the range |code1| <= |c| < |code2|, where the range |
| 226 does not include newline. For internal use only. |
| 227 """ |
| 228 nullable = 0 |
| 229 match_nl = 0 |
| 230 range = None # (code, code) |
| 231 uppercase_range = None # (code, code) or None |
| 232 lowercase_range = None # (code, code) or None |
| 233 |
| 234 def __init__(self, code1, code2): |
| 235 self.range = (code1, code2) |
| 236 self.uppercase_range = uppercase_range(code1, code2) |
| 237 self.lowercase_range = lowercase_range(code1, code2) |
| 238 |
| 239 def build_machine(self, m, initial_state, final_state, match_bol, nocase): |
| 240 if match_bol: |
| 241 initial_state = self.build_opt(m, initial_state, BOL) |
| 242 initial_state.add_transition(self.range, final_state) |
| 243 if nocase: |
| 244 if self.uppercase_range: |
| 245 initial_state.add_transition(self.uppercase_range, final_state) |
| 246 if self.lowercase_range: |
| 247 initial_state.add_transition(self.lowercase_range, final_state) |
| 248 |
| 249 def calc_str(self): |
| 250 return "CodeRange(%d,%d)" % (self.code1, self.code2) |
| 251 |
| 252 class _RawNewline(RE): |
| 253 """ |
| 254 RawNewline is a low-level RE which matches a newline character. |
| 255 For internal use only. |
| 256 """ |
| 257 nullable = 0 |
| 258 match_nl = 1 |
| 259 |
| 260 def build_machine(self, m, initial_state, final_state, match_bol, nocase): |
| 261 if match_bol: |
| 262 initial_state = self.build_opt(m, initial_state, BOL) |
| 263 s = self.build_opt(m, initial_state, EOL) |
| 264 s.add_transition((nl_code, nl_code + 1), final_state) |
| 265 |
| 266 RawNewline = _RawNewline() |
| 267 |
| 268 |
| 269 class SpecialSymbol(RE): |
| 270 """ |
| 271 SpecialSymbol(sym) is an RE which matches the special input |
| 272 symbol |sym|, which is one of BOL, EOL or EOF. |
| 273 """ |
| 274 nullable = 0 |
| 275 match_nl = 0 |
| 276 sym = None |
| 277 |
| 278 def __init__(self, sym): |
| 279 self.sym = sym |
| 280 |
| 281 def build_machine(self, m, initial_state, final_state, match_bol, nocase): |
| 282 # Sequences 'bol bol' and 'bol eof' are impossible, so only need |
| 283 # to allow for bol if sym is eol |
| 284 if match_bol and self.sym == EOL: |
| 285 initial_state = self.build_opt(m, initial_state, BOL) |
| 286 initial_state.add_transition(self.sym, final_state) |
| 287 |
| 288 |
| 289 class Seq(RE): |
| 290 """Seq(re1, re2, re3...) is an RE which matches |re1| followed by |
| 291 |re2| followed by |re3|...""" |
| 292 |
| 293 def __init__(self, *re_list): |
| 294 nullable = 1 |
| 295 for i in xrange(len(re_list)): |
| 296 re = re_list[i] |
| 297 self.check_re(i, re) |
| 298 nullable = nullable and re.nullable |
| 299 self.re_list = re_list |
| 300 self.nullable = nullable |
| 301 i = len(re_list) |
| 302 match_nl = 0 |
| 303 while i: |
| 304 i = i - 1 |
| 305 re = re_list[i] |
| 306 if re.match_nl: |
| 307 match_nl = 1 |
| 308 break |
| 309 if not re.nullable: |
| 310 break |
| 311 self.match_nl = match_nl |
| 312 |
| 313 def build_machine(self, m, initial_state, final_state, match_bol, nocase): |
| 314 re_list = self.re_list |
| 315 if len(re_list) == 0: |
| 316 initial_state.link_to(final_state) |
| 317 else: |
| 318 s1 = initial_state |
| 319 n = len(re_list) |
| 320 for i in xrange(n): |
| 321 if i < n - 1: |
| 322 s2 = m.new_state() |
| 323 else: |
| 324 s2 = final_state |
| 325 re = re_list[i] |
| 326 re.build_machine(m, s1, s2, match_bol, nocase) |
| 327 s1 = s2 |
| 328 match_bol = re.match_nl or (match_bol and re.nullable) |
| 329 |
| 330 def calc_str(self): |
| 331 return "Seq(%s)" % ','.join(map(str, self.re_list)) |
| 332 |
| 333 |
| 334 class Alt(RE): |
| 335 """Alt(re1, re2, re3...) is an RE which matches either |re1| or |
| 336 |re2| or |re3|...""" |
| 337 |
| 338 def __init__(self, *re_list): |
| 339 self.re_list = re_list |
| 340 nullable = 0 |
| 341 match_nl = 0 |
| 342 nullable_res = [] |
| 343 non_nullable_res = [] |
| 344 i = 1 |
| 345 for re in re_list: |
| 346 self.check_re(i, re) |
| 347 if re.nullable: |
| 348 nullable_res.append(re) |
| 349 nullable = 1 |
| 350 else: |
| 351 non_nullable_res.append(re) |
| 352 if re.match_nl: |
| 353 match_nl = 1 |
| 354 i = i + 1 |
| 355 self.nullable_res = nullable_res |
| 356 self.non_nullable_res = non_nullable_res |
| 357 self.nullable = nullable |
| 358 self.match_nl = match_nl |
| 359 |
| 360 def build_machine(self, m, initial_state, final_state, match_bol, nocase): |
| 361 for re in self.nullable_res: |
| 362 re.build_machine(m, initial_state, final_state, match_bol, nocase) |
| 363 if self.non_nullable_res: |
| 364 if match_bol: |
| 365 initial_state = self.build_opt(m, initial_state, BOL) |
| 366 for re in self.non_nullable_res: |
| 367 re.build_machine(m, initial_state, final_state, 0, nocase) |
| 368 |
| 369 def calc_str(self): |
| 370 return "Alt(%s)" % ','.join(map(str, self.re_list)) |
| 371 |
| 372 |
| 373 class Rep1(RE): |
| 374 """Rep1(re) is an RE which matches one or more repetitions of |re|.""" |
| 375 |
| 376 def __init__(self, re): |
| 377 self.check_re(1, re) |
| 378 self.re = re |
| 379 self.nullable = re.nullable |
| 380 self.match_nl = re.match_nl |
| 381 |
| 382 def build_machine(self, m, initial_state, final_state, match_bol, nocase): |
| 383 s1 = m.new_state() |
| 384 s2 = m.new_state() |
| 385 initial_state.link_to(s1) |
| 386 self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase) |
| 387 s2.link_to(s1) |
| 388 s2.link_to(final_state) |
| 389 |
| 390 def calc_str(self): |
| 391 return "Rep1(%s)" % self.re |
| 392 |
| 393 |
| 394 class SwitchCase(RE): |
| 395 """ |
| 396 SwitchCase(re, nocase) is an RE which matches the same strings as RE, |
| 397 but treating upper and lower case letters according to |nocase|. If |
| 398 |nocase| is true, case is ignored, otherwise it is not. |
| 399 """ |
| 400 re = None |
| 401 nocase = None |
| 402 |
| 403 def __init__(self, re, nocase): |
| 404 self.re = re |
| 405 self.nocase = nocase |
| 406 self.nullable = re.nullable |
| 407 self.match_nl = re.match_nl |
| 408 |
| 409 def build_machine(self, m, initial_state, final_state, match_bol, nocase): |
| 410 self.re.build_machine(m, initial_state, final_state, match_bol, |
| 411 self.nocase) |
| 412 |
| 413 def calc_str(self): |
| 414 if self.nocase: |
| 415 name = "NoCase" |
| 416 else: |
| 417 name = "Case" |
| 418 return "%s(%s)" % (name, self.re) |
| 419 |
| 420 # |
| 421 # Composite RE constructors |
| 422 # ------------------------- |
| 423 # |
| 424 # These REs are defined in terms of the primitive REs. |
| 425 # |
| 426 |
| 427 Empty = Seq() |
| 428 Empty.__doc__ = \ |
| 429 """ |
| 430 Empty is an RE which matches the empty string. |
| 431 """ |
| 432 Empty.str = "Empty" |
| 433 |
| 434 def Str1(s): |
| 435 """ |
| 436 Str1(s) is an RE which matches the literal string |s|. |
| 437 """ |
| 438 result = Seq(*tuple(map(Char, s))) |
| 439 result.str = "Str(%s)" % repr(s) |
| 440 return result |
| 441 |
| 442 def Str(*strs): |
| 443 """ |
| 444 Str(s) is an RE which matches the literal string |s|. |
| 445 Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|... |
| 446 """ |
| 447 if len(strs) == 1: |
| 448 return Str1(strs[0]) |
| 449 else: |
| 450 result = Alt(*tuple(map(Str1, strs))) |
| 451 result.str = "Str(%s)" % ','.join(map(repr, strs)) |
| 452 return result |
| 453 |
| 454 def Any(s): |
| 455 """ |
| 456 Any(s) is an RE which matches any character in the string |s|. |
| 457 """ |
| 458 #result = apply(Alt, tuple(map(Char, s))) |
| 459 result = CodeRanges(chars_to_ranges(s)) |
| 460 result.str = "Any(%s)" % repr(s) |
| 461 return result |
| 462 |
| 463 def AnyBut(s): |
| 464 """ |
| 465 AnyBut(s) is an RE which matches any character (including |
| 466 newline) which is not in the string |s|. |
| 467 """ |
| 468 ranges = chars_to_ranges(s) |
| 469 ranges.insert(0, -maxint) |
| 470 ranges.append(maxint) |
| 471 result = CodeRanges(ranges) |
| 472 result.str = "AnyBut(%s)" % repr(s) |
| 473 return result |
| 474 |
| 475 AnyChar = AnyBut("") |
| 476 AnyChar.__doc__ = \ |
| 477 """ |
| 478 AnyChar is an RE which matches any single character (including a newline). |
| 479 """ |
| 480 AnyChar.str = "AnyChar" |
| 481 |
| 482 def Range(s1, s2 = None): |
| 483 """ |
| 484 Range(c1, c2) is an RE which matches any single character in the range |
| 485 |c1| to |c2| inclusive. |
| 486 Range(s) where |s| is a string of even length is an RE which matches |
| 487 any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,... |
| 488 """ |
| 489 if s2: |
| 490 result = CodeRange(ord(s1), ord(s2) + 1) |
| 491 result.str = "Range(%s,%s)" % (s1, s2) |
| 492 else: |
| 493 ranges = [] |
| 494 for i in range(0, len(s1), 2): |
| 495 ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1)) |
| 496 result = Alt(*ranges) |
| 497 result.str = "Range(%s)" % repr(s1) |
| 498 return result |
| 499 |
| 500 def Opt(re): |
| 501 """ |
| 502 Opt(re) is an RE which matches either |re| or the empty string. |
| 503 """ |
| 504 result = Alt(re, Empty) |
| 505 result.str = "Opt(%s)" % re |
| 506 return result |
| 507 |
| 508 def Rep(re): |
| 509 """ |
| 510 Rep(re) is an RE which matches zero or more repetitions of |re|. |
| 511 """ |
| 512 result = Opt(Rep1(re)) |
| 513 result.str = "Rep(%s)" % re |
| 514 return result |
| 515 |
| 516 def NoCase(re): |
| 517 """ |
| 518 NoCase(re) is an RE which matches the same strings as RE, but treating |
| 519 upper and lower case letters as equivalent. |
| 520 """ |
| 521 return SwitchCase(re, nocase = 1) |
| 522 |
| 523 def Case(re): |
| 524 """ |
| 525 Case(re) is an RE which matches the same strings as RE, but treating |
| 526 upper and lower case letters as distinct, i.e. it cancels the effect |
| 527 of any enclosing NoCase(). |
| 528 """ |
| 529 return SwitchCase(re, nocase = 0) |
| 530 |
| 531 # |
| 532 # RE Constants |
| 533 # |
| 534 |
| 535 Bol = Char(BOL) |
| 536 Bol.__doc__ = \ |
| 537 """ |
| 538 Bol is an RE which matches the beginning of a line. |
| 539 """ |
| 540 Bol.str = "Bol" |
| 541 |
| 542 Eol = Char(EOL) |
| 543 Eol.__doc__ = \ |
| 544 """ |
| 545 Eol is an RE which matches the end of a line. |
| 546 """ |
| 547 Eol.str = "Eol" |
| 548 |
| 549 Eof = Char(EOF) |
| 550 Eof.__doc__ = \ |
| 551 """ |
| 552 Eof is an RE which matches the end of the file. |
| 553 """ |
| 554 Eof.str = "Eof" |
| 555 |
OLD | NEW |