OLD | NEW |
| (Empty) |
1 # Copyright 2008 The RE2 Authors. All Rights Reserved. | |
2 # Use of this source code is governed by a BSD-style | |
3 # license that can be found in the LICENSE file. | |
4 | |
5 """Parser for Unicode data files (as distributed by unicode.org).""" | |
6 | |
7 import os | |
8 import re | |
9 import urllib2 | |
10 | |
# Directory or URL where Unicode tables reside.  Used as the default
# unicode_dir argument of CaseGroups, Scripts and Categories.
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"

# Largest valid Unicode code value.  _UInt and _UStr reject anything
# above this bound.
_RUNE_MAX = 0x10FFFF
16 | |
17 | |
class Error(Exception):
    """Root of the exception hierarchy used by this Unicode parser."""
    pass
20 | |
21 | |
class InputError(Error):
    """Signals invalid input to the Unicode parser."""
    pass
24 | |
25 | |
def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    The string must consist of 4 to 6 hexadecimal digits and nothing
    else.  int(s, 16) on its own is more lenient than that: it also
    accepts a sign, surrounding white space and a '0x' prefix (and '_'
    digit separators on Python 3), so strings such as '0x1F' or '-0000'
    would otherwise be taken for code points.  The digits are therefore
    checked explicitly before converting.

    Args:
      s: string to convert

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # \Z rather than $ so that a trailing newline is rejected too.
    if re.match("[0-9A-Fa-f]{4,6}\\Z", s) is None:
        raise InputError("invalid Unicode value %s" % (s,))
    v = int(s, 16)
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v
46 | |
47 | |
def _URange(s):
    """Expands a code point or 'lo..hi' range string into its code points.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

    Args:
      s: string holding one hex code point, or two joined by '..'

    Returns:
      Sequence of the code points denoted by s, in increasing order.

    Raises:
      InputError: s has more than two endpoints, an endpoint is not a
        valid Unicode value, or the low endpoint is not strictly below
        the high one.
    """
    endpoints = s.split("..")
    if len(endpoints) <= 2:
        # Endpoints are converted left to right, so a bad low endpoint is
        # reported before a bad high one.
        values = [_UInt(e) for e in endpoints]
        if len(values) == 1:
            return values
        lo, hi = values
        if lo < hi:
            return range(lo, hi + 1)
    raise InputError("invalid Unicode range %s" % (s,))
72 | |
73 | |
def _UStr(v):
    """Formats a Unicode code point as an upper-case hex literal.

    0x263a => '0x263A'.

    Args:
      v: code point to format

    Returns:
      '0x' followed by the value in upper-case hex, zero-padded to at
      least four digits.

    Raises:
      InputError: v is negative or larger than _RUNE_MAX.
    """
    out_of_range = v < 0 or v > _RUNE_MAX
    if out_of_range:
        raise InputError("invalid Unicode value %s" % (v,))
    hex_text = "0x%04X" % (v,)
    return hex_text
91 | |
92 | |
93 def _ParseContinue(s): | |
94 """Parses a Unicode continuation field. | |
95 | |
96 These are of the form '<Name, First>' or '<Name, Last>'. | |
97 Instead of giving an explicit range in a single table entry, | |
98 some Unicode tables use two entries, one for the first | |
99 code value in the range and one for the last. | |
100 The first entry's description is '<Name, First>' instead of 'Name' | |
101 and the second is '<Name, Last>'. | |
102 | |
103 '<Name, First>' => ('Name', 'First') | |
104 '<Name, Last>' => ('Name', 'Last') | |
105 'Anything else' => ('Anything else', None) | |
106 | |
107 Args: | |
108 s: continuation field string | |
109 | |
110 Returns: | |
111 pair: name and ('First', 'Last', or None) | |
112 """ | |
113 | |
114 match = re.match("<(.*), (First|Last)>", s) | |
115 if match is not None: | |
116 return match.groups() | |
117 return (s, None) | |
118 | |
119 | |
def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    For a range split across a First/Last pair of lines, doline is called
    once, with fields[0] rewritten to 'XXXX..YYYY' and fields[1] set to
    the bare name.  If parsing a line or doline raises an exception, the
    reader prints that exception, prefixed with the file name and line
    number, and re-raises it immediately; the rest of the file is not
    processed.

    Arguments:
      filename: the Unicode data file to read (a local path or an
        http:// or https:// URL), or a file-like object / iterable of
        lines.  Lines may be text or bytes.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or the file ends in the middle of a First/Last range.
    """

    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only a handle opened here is ours to close; an object supplied by
    # the caller is left open for the caller to manage.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = urllib2.urlopen(filename)
        else:
            fil = open(filename, "r")
    else:
        fil = filename

    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    try:
        for line in fil:
            lineno += 1
            try:
                # URL responses and binary file objects yield bytes on
                # Python 3; the string operations below need text.  (On
                # Python 2 bytes is str, so this is a no-op there.)
                if isinstance(line, bytes) and not isinstance(line, str):
                    line = line.decode("utf-8", "replace")

                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = range(first, codes[0] + 1)
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the failure happened, then propagate it.
                print("%s:%d: %s" % (filename, lineno, e))
                raise

        if expect_last is not None:
            raise InputError("expected Last line for %s; got EOF" %
                             (expect_last,))
    finally:
        if opened_here:
            fil.close()
217 | |
218 | |
def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns Unicode code groups equivalent under case folding.

    Reads CaseFolding.txt from unicode_dir.  Only entries whose status
    field is "C" or "S" are used; all others are ignored.

    Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups):
        togroup: dict mapping each folded (target) code point to its
          group.
        groups: sorted list of all groups.  These are the same list
          objects that togroup refers to.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        # A new group starts out containing the fold target itself.
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

    # Materialise the values: a dict view (Python 3) cannot be sorted
    # in place.
    groups = list(togroup.values())
    for g in groups:
        g.sort()
    groups.sort()
    return togroup, groups
251 | |
252 | |
def Scripts(unicode_dir=_UNICODE_DIR):
    """Collects the code points listed for each script in Scripts.txt.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """

    by_script = {}

    def DoLine(codes, fields):
        """Files the codes of one Scripts.txt entry under its script name."""
        _, script = fields
        if script not in by_script:
            by_script[script] = []
        by_script[script].extend(codes)

    table_path = unicode_dir + "/Scripts.txt"
    ReadUnicodeTable(table_path, 2, DoLine)
    return by_script
272 | |
273 | |
def Categories(unicode_dir=_UNICODE_DIR):
    """Collects the code points listed for each category in UnicodeData.txt.

    Besides the category named on each line (e.g. 'Lu'), the codes are
    also filed under its first letter (e.g. 'L') when the category name
    is longer than one character.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """

    by_category = {}

    def DoLine(codes, fields):
        """Files the codes of one UnicodeData.txt entry under its categories."""
        full_name = fields[2]
        keys = [full_name]
        # Add codes from Lu into L, etc.
        if len(full_name) > 1:
            keys.append(full_name[0])
        for key in keys:
            by_category.setdefault(key, []).extend(codes)

    table_path = unicode_dir + "/UnicodeData.txt"
    ReadUnicodeTable(table_path, 15, DoLine)
    return by_category
297 | |
OLD | NEW |