| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/python | |
| 2 # coding=utf-8 | |
| 3 # | |
| 4 # Copyright 2008 The RE2 Authors. All Rights Reserved. | |
| 5 # Use of this source code is governed by a BSD-style | |
| 6 # license that can be found in the LICENSE file. | |
| 7 | |
| 8 # See unicode_casefold.h for description of case folding tables. | |
| 9 | |
| 10 """Generate C++ table for Unicode case folding.""" | |
| 11 | |
| 12 import sys | |
| 13 import unicode | |
| 14 | |
| 15 _header = """ | |
| 16 // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. | |
| 17 // make_unicode_casefold.py >unicode_casefold.cc | |
| 18 | |
| 19 #include "re2/unicode_casefold.h" | |
| 20 | |
| 21 namespace re2 { | |
| 22 | |
| 23 """ | |
| 24 | |
| 25 _trailer = """ | |
| 26 | |
| 27 } // namespace re2 | |
| 28 | |
| 29 """ | |
| 30 | |
| 31 def _Delta(a, b): | |
| 32 """Compute the delta for b - a. Even/odd and odd/even | |
| 33 are handled specially, as described above.""" | |
| 34 if a+1 == b: | |
| 35 if a%2 == 0: | |
| 36 return 'EvenOdd' | |
| 37 else: | |
| 38 return 'OddEven' | |
| 39 if a == b+1: | |
| 40 if a%2 == 0: | |
| 41 return 'OddEven' | |
| 42 else: | |
| 43 return 'EvenOdd' | |
| 44 return b - a | |
| 45 | |
| 46 def _AddDelta(a, delta): | |
| 47 """Return a + delta, handling EvenOdd and OddEven specially.""" | |
| 48 if type(delta) == int: | |
| 49 return a+delta | |
| 50 if delta == 'EvenOdd': | |
| 51 if a%2 == 0: | |
| 52 return a+1 | |
| 53 else: | |
| 54 return a-1 | |
| 55 if delta == 'OddEven': | |
| 56 if a%2 == 1: | |
| 57 return a+1 | |
| 58 else: | |
| 59 return a-1 | |
| 60 print >>sys.stderr, "Bad Delta: ", delta | |
| 61 raise "Bad Delta" | |
| 62 | |
| 63 def _MakeRanges(pairs): | |
| 64 """Turn a list like [(65,97), (66, 98), ..., (90,122)] | |
| 65 into [(65, 90, +32)].""" | |
| 66 ranges = [] | |
| 67 last = -100 | |
| 68 | |
| 69 def evenodd(last, a, b, r): | |
| 70 if a != last+1 or b != _AddDelta(a, r[2]): | |
| 71 return False | |
| 72 r[1] = a | |
| 73 return True | |
| 74 | |
| 75 def evenoddpair(last, a, b, r): | |
| 76 if a != last+2: | |
| 77 return False | |
| 78 delta = r[2] | |
| 79 d = delta | |
| 80 if type(delta) is not str: | |
| 81 return False | |
| 82 if delta.endswith('Skip'): | |
| 83 d = delta[:-4] | |
| 84 else: | |
| 85 delta = d + 'Skip' | |
| 86 if b != _AddDelta(a, d): | |
| 87 return False | |
| 88 r[1] = a | |
| 89 r[2] = delta | |
| 90 return True | |
| 91 | |
| 92 for a, b in pairs: | |
| 93 if ranges and evenodd(last, a, b, ranges[-1]): | |
| 94 pass | |
| 95 elif ranges and evenoddpair(last, a, b, ranges[-1]): | |
| 96 pass | |
| 97 else: | |
| 98 ranges.append([a, a, _Delta(a, b)]) | |
| 99 last = a | |
| 100 return ranges | |
| 101 | |
| 102 # The maximum size of a case-folding group. | |
| 103 # Case folding is implemented in parse.cc by a recursive process | |
| 104 # with a recursion depth equal to the size of the largest | |
| 105 # case-folding group, so it is important that this bound be small. | |
| 106 # The current tables have no group bigger than 4. | |
| 107 # If there are ever groups bigger than 10 or so, it will be | |
| 108 # time to rework the code in parse.cc. | |
| 109 MaxCasefoldGroup = 4 | |
| 110 | |
| 111 def main(): | |
| 112 lowergroups, casegroups = unicode.CaseGroups() | |
| 113 foldpairs = [] | |
| 114 seen = {} | |
| 115 for c in casegroups: | |
| 116 if len(c) > MaxCasefoldGroup: | |
| 117 raise unicode.Error("casefold group too long: %s" % (c,)) | |
| 118 for i in range(len(c)): | |
| 119 if c[i-1] in seen: | |
| 120 raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) | |
| 121 seen[c[i-1]] = True | |
| 122 foldpairs.append([c[i-1], c[i]]) | |
| 123 | |
| 124 lowerpairs = [] | |
| 125 for lower, group in lowergroups.iteritems(): | |
| 126 for g in group: | |
| 127 if g != lower: | |
| 128 lowerpairs.append([g, lower]) | |
| 129 | |
| 130 def printpairs(name, foldpairs): | |
| 131 foldpairs.sort() | |
| 132 foldranges = _MakeRanges(foldpairs) | |
| 133 print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs)
, len(foldranges)) | |
| 134 print "const CaseFold unicode_%s[] = {" % (name,) | |
| 135 for lo, hi, delta in foldranges: | |
| 136 print "\t{ %d, %d, %s }," % (lo, hi, delta) | |
| 137 print "};" | |
| 138 print "const int num_unicode_%s = %d;" % (name, len(foldranges),) | |
| 139 print "" | |
| 140 | |
| 141 print _header | |
| 142 printpairs("casefold", foldpairs) | |
| 143 printpairs("tolower", lowerpairs) | |
| 144 print _trailer | |
| 145 | |
| 146 if __name__ == '__main__': | |
| 147 main() | |
| OLD | NEW |