| Index: third_party/re2/re2/unicode.py
|
| diff --git a/third_party/re2/re2/unicode.py b/third_party/re2/re2/unicode.py
|
| deleted file mode 100644
|
| index 6dfe87bbcef6e4b9acc1bdb4d39eb7faec18275d..0000000000000000000000000000000000000000
|
| --- a/third_party/re2/re2/unicode.py
|
| +++ /dev/null
|
| @@ -1,297 +0,0 @@
|
| -# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
| -# Use of this source code is governed by a BSD-style
|
| -# license that can be found in the LICENSE file.
|
| -
|
| -"""Parser for Unicode data files (as distributed by unicode.org)."""
|
| -
|
| -import os
|
| -import re
|
| -import urllib2
|
| -
|
| -# Directory or URL where Unicode tables reside.
|
| -_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"
|
| -
|
| -# Largest valid Unicode code value.
|
| -_RUNE_MAX = 0x10FFFF
|
| -
|
| -
|
| -class Error(Exception):
|
| - """Unicode error base class."""
|
| -
|
| -
|
| -class InputError(Error):
|
| - """Unicode input error class. Raised on invalid input."""
|
| -
|
| -
|
| -def _UInt(s):
|
| - """Converts string to Unicode code point ('263A' => 0x263a).
|
| -
|
| - Args:
|
| - s: string to convert
|
| -
|
| - Returns:
|
| - Unicode code point
|
| -
|
| - Raises:
|
| - InputError: the string is not a valid Unicode value.
|
| - """
|
| -
|
| - try:
|
| - v = int(s, 16)
|
| - except ValueError:
|
| - v = -1
|
| - if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
|
| - raise InputError("invalid Unicode value %s" % (s,))
|
| - return v
|
| -
|
| -
|
| -def _URange(s):
|
| - """Converts string to Unicode range.
|
| -
|
| - '0001..0003' => [1, 2, 3].
|
| - '0001' => [1].
|
| -
|
| - Args:
|
| - s: string to convert
|
| -
|
| - Returns:
|
| - Unicode range
|
| -
|
| - Raises:
|
| - InputError: the string is not a valid Unicode range.
|
| - """
|
| - a = s.split("..")
|
| - if len(a) == 1:
|
| - return [_UInt(a[0])]
|
| - if len(a) == 2:
|
| - lo = _UInt(a[0])
|
| - hi = _UInt(a[1])
|
| - if lo < hi:
|
| - return range(lo, hi + 1)
|
| - raise InputError("invalid Unicode range %s" % (s,))
|
| -
|
| -
|
| -def _UStr(v):
|
| - """Converts Unicode code point to hex string.
|
| -
|
| - 0x263a => '0x263A'.
|
| -
|
| - Args:
|
| - v: code point to convert
|
| -
|
| - Returns:
|
| - Unicode string
|
| -
|
| - Raises:
|
| - InputError: the argument is not a valid Unicode value.
|
| - """
|
| - if v < 0 or v > _RUNE_MAX:
|
| - raise InputError("invalid Unicode value %s" % (v,))
|
| - return "0x%04X" % (v,)
|
| -
|
| -
|
| -def _ParseContinue(s):
|
| - """Parses a Unicode continuation field.
|
| -
|
| - These are of the form '<Name, First>' or '<Name, Last>'.
|
| - Instead of giving an explicit range in a single table entry,
|
| - some Unicode tables use two entries, one for the first
|
| - code value in the range and one for the last.
|
| - The first entry's description is '<Name, First>' instead of 'Name'
|
| - and the second is '<Name, Last>'.
|
| -
|
| - '<Name, First>' => ('Name', 'First')
|
| - '<Name, Last>' => ('Name', 'Last')
|
| - 'Anything else' => ('Anything else', None)
|
| -
|
| - Args:
|
| - s: continuation field string
|
| -
|
| - Returns:
|
| - pair: name and ('First', 'Last', or None)
|
| - """
|
| -
|
| - match = re.match("<(.*), (First|Last)>", s)
|
| - if match is not None:
|
| - return match.groups()
|
| - return (s, None)
|
| -
|
| -
|
| -def ReadUnicodeTable(filename, nfields, doline):
|
| - """Generic Unicode table text file reader.
|
| -
|
| - The reader takes care of stripping out comments and also
|
| - parsing the two different ways that the Unicode tables specify
|
| - code ranges (using the .. notation and splitting the range across
|
| - multiple lines).
|
| -
|
| - Each non-comment line in the table is expected to have the given
|
| - number of fields. The first field is known to be the Unicode value
|
| - and the second field its description.
|
| -
|
| - The reader calls doline(codes, fields) for each entry in the table.
|
| - If fn raises an exception, the reader prints that exception,
|
| - prefixed with the file name and line number, and continues
|
| - processing the file. When done with the file, the reader re-raises
|
| - the first exception encountered during the file.
|
| -
|
| - Arguments:
|
| - filename: the Unicode data file to read, or a file-like object.
|
| - nfields: the number of expected fields per line in that file.
|
| - doline: the function to call for each table entry.
|
| -
|
| - Raises:
|
| - InputError: nfields is invalid (must be >= 2).
|
| - """
|
| -
|
| - if nfields < 2:
|
| - raise InputError("invalid number of fields %d" % (nfields,))
|
| -
|
| - if type(filename) == str:
|
| - if filename.startswith("http://"):
|
| - fil = urllib2.urlopen(filename)
|
| - else:
|
| - fil = open(filename, "r")
|
| - else:
|
| - fil = filename
|
| -
|
| - first = None # first code in multiline range
|
| - expect_last = None # tag expected for "Last" line in multiline range
|
| - lineno = 0 # current line number
|
| - for line in fil:
|
| - lineno += 1
|
| - try:
|
| - # Chop # comments and white space; ignore empty lines.
|
| - sharp = line.find("#")
|
| - if sharp >= 0:
|
| - line = line[:sharp]
|
| - line = line.strip()
|
| - if not line:
|
| - continue
|
| -
|
| - # Split fields on ";", chop more white space.
|
| - # Must have the expected number of fields.
|
| - fields = [s.strip() for s in line.split(";")]
|
| - if len(fields) != nfields:
|
| - raise InputError("wrong number of fields %d %d - %s" %
|
| - (len(fields), nfields, line))
|
| -
|
| - # The Unicode text files have two different ways
|
| - # to list a Unicode range. Either the first field is
|
| - # itself a range (0000..FFFF), or the range is split
|
| - # across two lines, with the second field noting
|
| - # the continuation.
|
| - codes = _URange(fields[0])
|
| - (name, cont) = _ParseContinue(fields[1])
|
| -
|
| - if expect_last is not None:
|
| - # If the last line gave the First code in a range,
|
| - # this one had better give the Last one.
|
| - if (len(codes) != 1 or codes[0] <= first or
|
| - cont != "Last" or name != expect_last):
|
| - raise InputError("expected Last line for %s" %
|
| - (expect_last,))
|
| - codes = range(first, codes[0] + 1)
|
| - first = None
|
| - expect_last = None
|
| - fields[0] = "%04X..%04X" % (codes[0], codes[-1])
|
| - fields[1] = name
|
| - elif cont == "First":
|
| - # Otherwise, if this is the First code in a range,
|
| - # remember it and go to the next line.
|
| - if len(codes) != 1:
|
| - raise InputError("bad First line: range given")
|
| - expect_last = name
|
| - first = codes[0]
|
| - continue
|
| -
|
| - doline(codes, fields)
|
| -
|
| - except Exception, e:
|
| - print "%s:%d: %s" % (filename, lineno, e)
|
| - raise
|
| -
|
| - if expect_last is not None:
|
| - raise InputError("expected Last line for %s; got EOF" %
|
| - (expect_last,))
|
| -
|
| -
|
| -def CaseGroups(unicode_dir=_UNICODE_DIR):
|
| - """Returns list of Unicode code groups equivalent under case folding.
|
| -
|
| - Each group is a sorted list of code points,
|
| - and the list of groups is sorted by first code point
|
| - in the group.
|
| -
|
| - Args:
|
| - unicode_dir: Unicode data directory
|
| -
|
| - Returns:
|
| - list of Unicode code groups
|
| - """
|
| -
|
| - # Dict mapping lowercase code point to fold-equivalent group.
|
| - togroup = {}
|
| -
|
| - def DoLine(codes, fields):
|
| - """Process single CaseFolding.txt line, updating togroup."""
|
| - (_, foldtype, lower, _) = fields
|
| - if foldtype not in ("C", "S"):
|
| - return
|
| - lower = _UInt(lower)
|
| - togroup.setdefault(lower, [lower]).extend(codes)
|
| -
|
| - ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)
|
| -
|
| - groups = togroup.values()
|
| - for g in groups:
|
| - g.sort()
|
| - groups.sort()
|
| - return togroup, groups
|
| -
|
| -
|
| -def Scripts(unicode_dir=_UNICODE_DIR):
|
| - """Returns dict mapping script names to code lists.
|
| -
|
| - Args:
|
| - unicode_dir: Unicode data directory
|
| -
|
| - Returns:
|
| - dict mapping script names to code lists
|
| - """
|
| -
|
| - scripts = {}
|
| -
|
| - def DoLine(codes, fields):
|
| - """Process single Scripts.txt line, updating scripts."""
|
| - (_, name) = fields
|
| - scripts.setdefault(name, []).extend(codes)
|
| -
|
| - ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
|
| - return scripts
|
| -
|
| -
|
| -def Categories(unicode_dir=_UNICODE_DIR):
|
| - """Returns dict mapping category names to code lists.
|
| -
|
| - Args:
|
| - unicode_dir: Unicode data directory
|
| -
|
| - Returns:
|
| - dict mapping category names to code lists
|
| - """
|
| -
|
| - categories = {}
|
| -
|
| - def DoLine(codes, fields):
|
| - """Process single UnicodeData.txt line, updating categories."""
|
| - category = fields[2]
|
| - categories.setdefault(category, []).extend(codes)
|
| - # Add codes from Lu into L, etc.
|
| - if len(category) > 1:
|
| - short = category[0]
|
| - categories.setdefault(short, []).extend(codes)
|
| -
|
| - ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
|
| - return categories
|
| -
|
|
|