Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: third_party/re2/re2/unicode.py

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/tostring.cc ('k') | third_party/re2/re2/unicode_casefold.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2008 The RE2 Authors. All Rights Reserved.
2 # Use of this source code is governed by a BSD-style
3 # license that can be found in the LICENSE file.
4
5 """Parser for Unicode data files (as distributed by unicode.org)."""
6
7 import os
8 import re
9 import urllib2
10
# Directory or URL where the Unicode data tables reside.  Used as the
# default unicode_dir argument of CaseGroups, Scripts and Categories.
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"

# Largest valid Unicode code value; _UInt and _UStr reject anything above it.
_RUNE_MAX = 0x10FFFF
16
17
class Error(Exception):
    """Root of the exception hierarchy raised by this module."""
20
21
class InputError(Error):
    """Raised when a Unicode value, range, or table line is invalid."""
24
25
def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    The string must consist of 4 to 6 hexadecimal digits and nothing else.

    Args:
      s: string to convert

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # int(s, 16) alone is too lenient: it also accepts a leading sign,
    # surrounding white space and a "0x" prefix (e.g. "0x1F" or "-000"),
    # so the digits are validated explicitly before converting.
    hex_digits = "0123456789abcdefABCDEF"
    well_formed = 4 <= len(s) <= 6 and all(c in hex_digits for c in s)
    v = int(s, 16) if well_formed else -1
    if v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v
46
47
def _URange(s):
    """Expands a Unicode range string into the code points it covers.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

    A two-ended range must be strictly increasing: '0003..0001' and
    '0001..0001' are both rejected.

    Args:
      s: string to convert, either 'XXXX' or 'XXXX..YYYY'

    Returns:
      Unicode range: a one-element list for a single value, otherwise
      range(lo, hi + 1).

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    pieces = s.split("..")
    count = len(pieces)

    # A lone value is a range of length one.
    if count == 1:
        return [_UInt(pieces[0])]

    # More than one ".." separator is never valid.
    if count != 2:
        raise InputError("invalid Unicode range %s" % (s,))

    lo, hi = [_UInt(piece) for piece in pieces]
    if not lo < hi:
        raise InputError("invalid Unicode range %s" % (s,))
    return range(lo, hi + 1)
72
73
def _UStr(v):
    """Formats a Unicode code point as a '0x'-prefixed hex string.

    0x263a => '0x263A'.

    The digits are upper case and zero-padded to at least four.

    Args:
      v: code point to convert

    Returns:
      Unicode string

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    out_of_range = v < 0 or v > _RUNE_MAX
    if out_of_range:
        raise InputError("invalid Unicode value %s" % (v,))

    formatted = "0x%04X" % (v,)
    return formatted
91
92
def _ParseContinue(s):
    """Splits a Unicode description field into its name and range marker.

    Some Unicode tables describe a range with two entries instead of an
    explicit range in a single entry: one entry for the first code value,
    described as '<Name, First>', and one for the last code value,
    described as '<Name, Last>'.

    '<Name, First>' => ('Name', 'First')
    '<Name, Last>' => ('Name', 'Last')
    'Anything else' => ('Anything else', None)

    Args:
      s: continuation field string

    Returns:
      pair: name and ('First', 'Last', or None)
    """
    found = re.match("<(.*), (First|Last)>", s)

    # Ordinary descriptions carry no marker and are passed through as-is.
    if found is None:
        return (s, None)

    name, marker = found.groups()
    return (name, marker)
118
119
def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader strips "#" comments and blank lines, splits each remaining
    line into ";"-separated fields, and parses the two different ways that
    the Unicode tables specify code ranges (using the .. notation, and
    splitting the range across a '<Name, First>' line and a
    '<Name, Last>' line).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    A First/Last pair of lines produces a single call: codes covers the
    whole range, fields[0] is rewritten to 'XXXX..YYYY' and fields[1] is
    the bare name.

    If parsing a line or doline raises an exception, the reader prints
    that exception, prefixed with the file name and line number, and
    re-raises it immediately; the rest of the file is not processed.

    Arguments:
      filename: the Unicode data file to read (a path, or an http:// or
        https:// URL), or a file-like object yielding lines.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or the file ends while a '<Name, Last>' line is still expected.
    """

    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only a handle opened here is closed here; a caller-supplied
    # file-like object stays under the caller's control.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = urllib2.urlopen(filename)
        else:
            fil = open(filename, "r")
    else:
        fil = filename

    try:
        first = None        # first code in multiline range
        expect_last = None  # tag expected for "Last" line in multiline range
        lineno = 0          # current line number
        for line in fil:
            lineno += 1
            try:
                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = range(first, codes[0] + 1)
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the failure happened, then let it propagate.
                print("%s:%d: %s" % (filename, lineno, e))
                raise

        if expect_last is not None:
            raise InputError("expected Last line for %s; got EOF" %
                             (expect_last,))
    finally:
        if opened_here:
            fil.close()
217
218
def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns the Unicode code groups equivalent under case folding.

    Reads CaseFolding.txt from unicode_dir and uses only the entries whose
    fold type is "C" or "S"; all other entries are ignored.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups):
        togroup: dict mapping each fold-target code point to its group.
        groups: list of all the groups.  Each group is a sorted list of
          code points, and the list of groups is itself sorted, so it is
          ordered by first code point in the group.  The group lists are
          the same objects that togroup refers to.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        # A new group starts out containing its own fold target.
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir + "/CaseFolding.txt", 4, DoLine)

    for g in togroup.values():
        g.sort()
    # sorted() builds a new list, so this does not rely on dict.values()
    # returning a list that can be sorted in place.
    groups = sorted(togroup.values())
    return togroup, groups
251
252
def Scripts(unicode_dir=_UNICODE_DIR):
    """Builds the script-name => code-point-list table from Scripts.txt.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """
    by_script = {}

    def DoLine(codes, fields):
        """Adds the codes of one Scripts.txt entry to its script's list."""
        _, script = fields
        if script not in by_script:
            by_script[script] = []
        by_script[script].extend(codes)

    table_path = unicode_dir + "/Scripts.txt"
    ReadUnicodeTable(table_path, 2, DoLine)
    return by_script
272
273
def Categories(unicode_dir=_UNICODE_DIR):
    """Builds the category-name => code-point-list table from UnicodeData.txt.

    Every entry is filed under its category as given in the third field.
    When that name is longer than one character, the entry is also filed
    under the name's first character (so codes from Lu also land in L).

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """
    by_category = {}

    def DoLine(codes, fields):
        """Files the codes of one UnicodeData.txt entry under its categories."""
        full_name = fields[2]
        targets = [full_name]
        # Add codes from Lu into L, etc.
        if len(full_name) > 1:
            targets.append(full_name[0])
        for target in targets:
            by_category.setdefault(target, []).extend(codes)

    table_path = unicode_dir + "/UnicodeData.txt"
    ReadUnicodeTable(table_path, 15, DoLine)
    return by_category
297
OLDNEW
« no previous file with comments | « third_party/re2/re2/tostring.cc ('k') | third_party/re2/re2/unicode_casefold.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698