third_party/re2/re2/unicode.py - Issue 1544433002: Replace RE2 import with a dependency

Unified Diff: third_party/re2/re2/unicode.py

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/re2/re2/unicode.py

diff --git a/third_party/re2/re2/unicode.py b/third_party/re2/re2/unicode.py

deleted file mode 100644

index 6dfe87bbcef6e4b9acc1bdb4d39eb7faec18275d..0000000000000000000000000000000000000000

--- a/third_party/re2/re2/unicode.py

+++ /dev/null

@@ -1,297 +0,0 @@

-# Use of this source code is governed by a BSD-style

-# license that can be found in the LICENSE file.

-"""Parser for Unicode data files (as distributed by unicode.org)."""

-import os

-import re

-import urllib2

# Default location of the Unicode data files: a local directory or an
# "http://" URL.  Table names such as "/UnicodeData.txt" are appended to it.
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"

# Largest valid Unicode code value; anything above it is rejected as input.
_RUNE_MAX = 0x10FFFF

class Error(Exception):
    """Root of the exception hierarchy used by this module."""

class InputError(Error):
    """Signals invalid input, e.g. a bad code point, range or table line."""

def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    The string must consist of 4 to 6 hexadecimal digits and nothing else.

    Args:
      s: string to convert

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # int(s, 16) on its own is too lenient: it also accepts a sign, a "0x"
    # prefix and surrounding white space, so "-000" or "0x41" would slip
    # through.  Check the digits explicitly before converting.
    if re.match(r"[0-9A-Fa-f]{4,6}\Z", s) is None:
        raise InputError("invalid Unicode value %s" % (s,))
    v = int(s, 16)
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v

def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    A two-ended range must be strictly increasing: 'lo..hi' with lo >= hi
    is rejected.

    Args:
      s: string to convert

    Returns:
      Unicode range, as a list of code points

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    a = s.split("..")
    if len(a) == 1:
        return [_UInt(a[0])]
    if len(a) == 2:
        lo = _UInt(a[0])
        hi = _UInt(a[1])
        if lo < hi:
            # Always hand back a real list, like the single-value case
            # above; a bare range() is only a list on Python 2.
            return list(range(lo, hi + 1))
    # Reached for more than one "..", or for a range that is not increasing.
    raise InputError("invalid Unicode range %s" % (s,))

def _UStr(v):
    """Formats a Unicode code point as an upper-case hex string.

      0x263a => '0x263A'.

    The value is zero-padded to at least four hex digits.

    Args:
      v: code point to format

    Returns:
      hex string with a leading '0x'

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    out_of_range = v < 0 or v > _RUNE_MAX
    if not out_of_range:
        return "0x%04X" % (v,)
    raise InputError("invalid Unicode value %s" % (v,))

-def _ParseContinue(s):

- """Parses a Unicode continuation field.

- These are of the form '<Name, First>' or '<Name, Last>'.

- Instead of giving an explicit range in a single table entry,

- some Unicode tables use two entries, one for the first

- code value in the range and one for the last.

- The first entry's description is '<Name, First>' instead of 'Name'

- and the second is '<Name, Last>'.

- '<Name, First>' => ('Name', 'First')

- '<Name, Last>' => ('Name', 'Last')

- 'Anything else' => ('Anything else', None)

- Args:

- s: continuation field string

- Returns:

- pair: name and ('First', 'Last', or None)

- """

- match = re.match("<(.*), (First|Last)>", s)

- if match is not None:

- return match.groups()

- return (s, None)

def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table,
    where codes is the list of code points the entry covers.  For a range
    split across a First/Last pair of lines, doline is called once, with
    fields[0] rewritten to 'XXXX..YYYY' and fields[1] to the bare name.

    If parsing a line or the doline call raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it; the rest of the file is not processed.

    Arguments:
      filename: the Unicode data file to read (a path, or an http:// or
        https:// URL), or a file-like object yielding lines.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or the file ends in the middle of a First/Last range.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only a file opened here is closed here; a caller-supplied file-like
    # object remains the caller's responsibility.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = urllib2.urlopen(filename)
        else:
            fil = open(filename, "r")
    else:
        fil = filename

    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    try:
        for line in fil:
            lineno += 1
            try:
                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = list(range(first, codes[0] + 1))
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the failure happened, then let it propagate.
                print("%s:%d: %s" % (filename, lineno, e))
                raise
    finally:
        if opened_here:
            fil.close()

    if expect_last is not None:
        raise InputError("expected Last line for %s; got EOF" %
                         (expect_last,))

def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns Unicode code groups equivalent under case folding.

    Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.  Only the "C" and "S" entries of CaseFolding.txt
    are used.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded code
      point to its group, and groups is the sorted list of those same
      group lists.
    """
    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        # A group starts out holding the fold target itself.
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

    # Sort each group in place so the lists stored in togroup are sorted
    # too, then order the groups; sorted() avoids relying on values()
    # returning a list.
    for g in togroup.values():
        g.sort()
    groups = sorted(togroup.values())
    return togroup, groups

def Scripts(unicode_dir=_UNICODE_DIR):
    """Builds the table of code points belonging to each script.

    Reads Scripts.txt from the given Unicode data location.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """
    by_script = {}

    def Collect(codes, fields):
        """Adds the codes from one Scripts.txt entry to its script's list."""
        _, script = fields
        if script not in by_script:
            by_script[script] = []
        by_script[script].extend(codes)

    table = unicode_dir + "/Scripts.txt"
    ReadUnicodeTable(table, 2, Collect)
    return by_script

def Categories(unicode_dir=_UNICODE_DIR):
    """Builds the table of code points belonging to each category.

    Reads UnicodeData.txt from the given Unicode data location.  Codes in
    a multi-letter category are also listed under its first letter, so
    codes from Lu appear under L as well.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """
    by_category = {}

    def Collect(codes, fields):
        """Adds the codes from one UnicodeData.txt entry to by_category."""
        full = fields[2]
        keys = [full]
        # Add codes from Lu into L, etc.
        if len(full) > 1:
            keys.append(full[0])
        for key in keys:
            by_category.setdefault(key, []).extend(codes)

    table = unicode_dir + "/UnicodeData.txt"
    ReadUnicodeTable(table, 15, Collect)
    return by_category

« no previous file with comments | « third_party/re2/re2/tostring.cc ('k') | third_party/re2/re2/unicode_casefold.h » ('j') | no next file with comments »