OLD | NEW |
| (Empty) |
1 #!/usr/bin/python | |
2 # coding=utf-8 | |
3 # | |
4 # Copyright 2008 The RE2 Authors. All Rights Reserved. | |
5 # Use of this source code is governed by a BSD-style | |
6 # license that can be found in the LICENSE file. | |
7 | |
8 # See unicode_casefold.h for description of case folding tables. | |
9 | |
10 """Generate C++ table for Unicode case folding.""" | |
11 | |
12 import sys | |
13 import unicode | |
14 | |
15 _header = """ | |
16 // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. | |
17 // make_unicode_casefold.py >unicode_casefold.cc | |
18 | |
19 #include "re2/unicode_casefold.h" | |
20 | |
21 namespace re2 { | |
22 | |
23 """ | |
24 | |
25 _trailer = """ | |
26 | |
27 } // namespace re2 | |
28 | |
29 """ | |
30 | |
31 def _Delta(a, b): | |
32 """Compute the delta for b - a. Even/odd and odd/even | |
33 are handled specially, as described above.""" | |
34 if a+1 == b: | |
35 if a%2 == 0: | |
36 return 'EvenOdd' | |
37 else: | |
38 return 'OddEven' | |
39 if a == b+1: | |
40 if a%2 == 0: | |
41 return 'OddEven' | |
42 else: | |
43 return 'EvenOdd' | |
44 return b - a | |
45 | |
46 def _AddDelta(a, delta): | |
47 """Return a + delta, handling EvenOdd and OddEven specially.""" | |
48 if type(delta) == int: | |
49 return a+delta | |
50 if delta == 'EvenOdd': | |
51 if a%2 == 0: | |
52 return a+1 | |
53 else: | |
54 return a-1 | |
55 if delta == 'OddEven': | |
56 if a%2 == 1: | |
57 return a+1 | |
58 else: | |
59 return a-1 | |
60 print >>sys.stderr, "Bad Delta: ", delta | |
61 raise "Bad Delta" | |
62 | |
63 def _MakeRanges(pairs): | |
64 """Turn a list like [(65,97), (66, 98), ..., (90,122)] | |
65 into [(65, 90, +32)].""" | |
66 ranges = [] | |
67 last = -100 | |
68 | |
69 def evenodd(last, a, b, r): | |
70 if a != last+1 or b != _AddDelta(a, r[2]): | |
71 return False | |
72 r[1] = a | |
73 return True | |
74 | |
75 def evenoddpair(last, a, b, r): | |
76 if a != last+2: | |
77 return False | |
78 delta = r[2] | |
79 d = delta | |
80 if type(delta) is not str: | |
81 return False | |
82 if delta.endswith('Skip'): | |
83 d = delta[:-4] | |
84 else: | |
85 delta = d + 'Skip' | |
86 if b != _AddDelta(a, d): | |
87 return False | |
88 r[1] = a | |
89 r[2] = delta | |
90 return True | |
91 | |
92 for a, b in pairs: | |
93 if ranges and evenodd(last, a, b, ranges[-1]): | |
94 pass | |
95 elif ranges and evenoddpair(last, a, b, ranges[-1]): | |
96 pass | |
97 else: | |
98 ranges.append([a, a, _Delta(a, b)]) | |
99 last = a | |
100 return ranges | |
101 | |
102 # The maximum size of a case-folding group. | |
103 # Case folding is implemented in parse.cc by a recursive process | |
104 # with a recursion depth equal to the size of the largest | |
105 # case-folding group, so it is important that this bound be small. | |
106 # The current tables have no group bigger than 4. | |
107 # If there are ever groups bigger than 10 or so, it will be | |
108 # time to rework the code in parse.cc. | |
109 MaxCasefoldGroup = 4 | |
110 | |
111 def main(): | |
112 lowergroups, casegroups = unicode.CaseGroups() | |
113 foldpairs = [] | |
114 seen = {} | |
115 for c in casegroups: | |
116 if len(c) > MaxCasefoldGroup: | |
117 raise unicode.Error("casefold group too long: %s" % (c,)) | |
118 for i in range(len(c)): | |
119 if c[i-1] in seen: | |
120 raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) | |
121 seen[c[i-1]] = True | |
122 foldpairs.append([c[i-1], c[i]]) | |
123 | |
124 lowerpairs = [] | |
125 for lower, group in lowergroups.iteritems(): | |
126 for g in group: | |
127 if g != lower: | |
128 lowerpairs.append([g, lower]) | |
129 | |
130 def printpairs(name, foldpairs): | |
131 foldpairs.sort() | |
132 foldranges = _MakeRanges(foldpairs) | |
133 print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs)
, len(foldranges)) | |
134 print "const CaseFold unicode_%s[] = {" % (name,) | |
135 for lo, hi, delta in foldranges: | |
136 print "\t{ %d, %d, %s }," % (lo, hi, delta) | |
137 print "};" | |
138 print "const int num_unicode_%s = %d;" % (name, len(foldranges),) | |
139 print "" | |
140 | |
141 print _header | |
142 printpairs("casefold", foldpairs) | |
143 printpairs("tolower", lowerpairs) | |
144 print _trailer | |
145 | |
146 if __name__ == '__main__': | |
147 main() | |
OLD | NEW |