| OLD | NEW |
| (Empty) |
| 1 # Copyright 2008 The RE2 Authors. All Rights Reserved. | |
| 2 # Use of this source code is governed by a BSD-style | |
| 3 # license that can be found in the LICENSE file. | |
| 4 | |
| 5 """Parser for Unicode data files (as distributed by unicode.org).""" | |
| 6 | |
| 7 import os | |
| 8 import re | |
| 9 import urllib2 | |
| 10 | |
| 11 # Directory or URL where Unicode tables reside. | |
| 12 _UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd" | |
| 13 | |
| 14 # Largest valid Unicode code value. | |
| 15 _RUNE_MAX = 0x10FFFF | |
| 16 | |
| 17 | |
class Error(Exception):
    """Root of the exception hierarchy used by this Unicode parser."""
    pass
| 20 | |
| 21 | |
class InputError(Error):
    """Signals invalid input, such as a malformed code value or table line."""
    pass
| 24 | |
| 25 | |
def _UInt(s):
    """Converts a hex string to a Unicode code point ('263A' => 0x263a).

    The string must consist solely of 4 to 6 hexadecimal digits.  Spellings
    that int(s, 16) would tolerate -- a '0x' prefix, a leading sign,
    surrounding white space, digit-group underscores -- are rejected.

    Args:
      s: string to convert

    Returns:
      Unicode code point, as an int

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # Check the spelling up front instead of trusting int(), which accepts
    # far more than plain hex digits.  \Z (not $) so that a trailing
    # newline is not silently allowed.
    if re.match(r"[0-9A-Fa-f]{4,6}\Z", s) is None:
        raise InputError("invalid Unicode value %s" % (s,))
    v = int(s, 16)
    # Six hex digits can still exceed the largest code point.
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v
| 46 | |
| 47 | |
def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    A two-ended range must be strictly increasing: 'X..Y' with X >= Y is
    rejected.

    Args:
      s: string to convert

    Returns:
      list of the code points in the range (always a real list, for both
      the single-value and the 'lo..hi' form)

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    parts = s.split("..")
    if len(parts) == 1:
        return [_UInt(parts[0])]
    if len(parts) == 2:
        lo = _UInt(parts[0])
        hi = _UInt(parts[1])
        if lo < hi:
            # Materialize the range so that both forms return the same
            # type regardless of what the interpreter's range() yields.
            return list(range(lo, hi + 1))
    # Reached for three or more parts and for non-increasing ranges.
    raise InputError("invalid Unicode range %s" % (s,))
| 72 | |
| 73 | |
def _UStr(v):
    """Formats a Unicode code point as a hex literal.

      0x263a => '0x263A'.

    The result is zero-padded to at least four hex digits.

    Args:
      v: code point to format

    Returns:
      the '0x'-prefixed, upper-case hex spelling of v

    Raises:
      InputError: v lies outside the range 0.._RUNE_MAX.
    """
    out_of_range = v < 0 or v > _RUNE_MAX
    if out_of_range:
        message = "invalid Unicode value %s" % (v,)
        raise InputError(message)
    text = "0x%04X" % (v,)
    return text
| 91 | |
| 92 | |
| 93 def _ParseContinue(s): | |
| 94 """Parses a Unicode continuation field. | |
| 95 | |
| 96 These are of the form '<Name, First>' or '<Name, Last>'. | |
| 97 Instead of giving an explicit range in a single table entry, | |
| 98 some Unicode tables use two entries, one for the first | |
| 99 code value in the range and one for the last. | |
| 100 The first entry's description is '<Name, First>' instead of 'Name' | |
| 101 and the second is '<Name, Last>'. | |
| 102 | |
| 103 '<Name, First>' => ('Name', 'First') | |
| 104 '<Name, Last>' => ('Name', 'Last') | |
| 105 'Anything else' => ('Anything else', None) | |
| 106 | |
| 107 Args: | |
| 108 s: continuation field string | |
| 109 | |
| 110 Returns: | |
| 111 pair: name and ('First', 'Last', or None) | |
| 112 """ | |
| 113 | |
| 114 match = re.match("<(.*), (First|Last)>", s) | |
| 115 if match is not None: | |
| 116 return match.groups() | |
| 117 return (s, None) | |
| 118 | |
| 119 | |
def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table,
    where codes is the list of code points the entry covers and fields is
    the list of white-space-stripped fields.  For a range given as a
    First/Last pair of lines, doline is called once, with fields taken from
    the Last line, fields[0] rewritten to 'XXXX..YYYY' and fields[1] set to
    the bare name.

    If parsing a line or the doline call raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it immediately; the rest of the file is not processed.

    If filename is a string, the reader opens it (as a URL when it starts
    with 'http://' or 'https://', as a local file otherwise) and closes it
    again before returning or raising.  A file-like object passed in by the
    caller is left open.

    Arguments:
      filename: the Unicode data file to read, or a file-like object.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or a First line is not followed by its matching Last line.
    """

    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Remember whether the handle is ours, so we only close what we opened.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = urllib2.urlopen(filename)
        else:
            fil = open(filename, "r")
    else:
        fil = filename

    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    try:
        for line in fil:
            lineno += 1
            try:
                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = list(range(first, codes[0] + 1))
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the problem is, then let it propagate.
                print("%s:%d: %s" % (filename, lineno, e))
                raise

        # A First line at the very end of the file never got its Last.
        if expect_last is not None:
            raise InputError("expected Last line for %s; got EOF" %
                             (expect_last,))
    finally:
        if opened_here:
            fil.close()
| 217 | |
| 218 | |
def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns Unicode code groups equivalent under case folding.

    Reads CaseFolding.txt from unicode_dir and considers only the entries
    whose fold-type field is "C" or "S"; all other entries are skipped.

    Each group is a sorted list of code points, consisting of a fold
    target together with every code point that folds to it, and the list
    of groups is sorted by first code point in the group.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each fold target
      code point to its group; groups is the sorted list of those same
      group lists.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        # The group starts out holding the fold target itself.
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir + "/CaseFolding.txt", 4, DoLine)

    # Sort each group in place (togroup shares these list objects), then
    # order the groups themselves.  sorted() is used rather than sorting
    # the result of values() in place, which only works when values()
    # happens to return a list.
    for group in togroup.values():
        group.sort()
    groups = sorted(togroup.values())
    return togroup, groups
| 251 | |
| 252 | |
def Scripts(unicode_dir=_UNICODE_DIR):
    """Builds a table of the code points belonging to each script.

    Reads Scripts.txt from the given directory.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """

    by_script = {}

    def _Collect(codes, fields):
        """Files one Scripts.txt entry under its script name."""
        (_, script) = fields
        if script not in by_script:
            by_script[script] = []
        by_script[script].extend(codes)

    table_path = unicode_dir + "/Scripts.txt"
    ReadUnicodeTable(table_path, 2, _Collect)
    return by_script
| 272 | |
| 273 | |
def Categories(unicode_dir=_UNICODE_DIR):
    """Builds a table of the code points belonging to each category.

    Reads UnicodeData.txt from the given directory.  Code points of a
    multi-letter category (such as Lu) are also listed under the
    category's first letter (L).

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """

    by_category = {}

    def _Collect(codes, fields):
        """Files one UnicodeData.txt entry under its category name(s)."""
        full = fields[2]
        keys = [full]
        # Add codes from Lu into L, etc.
        if len(full) > 1:
            keys.append(full[0])
        for key in keys:
            by_category.setdefault(key, []).extend(codes)

    table_path = unicode_dir + "/UnicodeData.txt"
    ReadUnicodeTable(table_path, 15, _Collect)
    return by_category
| 297 | |
| OLD | NEW |