Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(13)

Unified Diff: third_party/re2/re2/unicode.py

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/tostring.cc ('k') | third_party/re2/re2/unicode_casefold.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/unicode.py
diff --git a/third_party/re2/re2/unicode.py b/third_party/re2/re2/unicode.py
deleted file mode 100644
index 6dfe87bbcef6e4b9acc1bdb4d39eb7faec18275d..0000000000000000000000000000000000000000
--- a/third_party/re2/re2/unicode.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright 2008 The RE2 Authors. All Rights Reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-"""Parser for Unicode data files (as distributed by unicode.org)."""
-
-import os
-import re
-import urllib2
-
-# Directory or URL where Unicode tables reside.
-_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"
-
-# Largest valid Unicode code value.
-_RUNE_MAX = 0x10FFFF
-
-
class Error(Exception):
    """Root of the exception hierarchy for the Unicode table parser."""
-
-
class InputError(Error):
    """Signals malformed input: a bad code point, range, or table line."""
-
-
def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    The string must consist of 4 to 6 hexadecimal digits and nothing else.

    Args:
      s: string to convert

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # Validate the spelling before converting: int(s, 16) on its own also
    # accepts a "0x" prefix, a leading sign, surrounding white space and
    # (on newer Pythons) underscores, none of which are valid here.
    if re.match(r"[0-9A-Fa-f]{4,6}\Z", s) is None:
        raise InputError("invalid Unicode value %s" % (s,))
    v = int(s, 16)
    # Six hex digits can still exceed the largest code point.
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v
-
-
def _URange(s):
    """Expands a code point field into the code points it covers.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

    Args:
      s: either a single code point or 'LO..HI' with LO strictly below HI

    Returns:
      sequence of the code points covered, in increasing order

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    pieces = s.split("..")
    # More than one ".." separator can never be a valid range; reject it
    # before attempting to parse the individual pieces.
    if len(pieces) > 2:
        raise InputError("invalid Unicode range %s" % (s,))

    bounds = [_UInt(piece) for piece in pieces]
    if len(bounds) == 1:
        return bounds

    lo, hi = bounds
    if lo >= hi:
        raise InputError("invalid Unicode range %s" % (s,))
    return range(lo, hi + 1)
-
-
def _UStr(v):
    """Formats a code point as an upper-case hex literal.

    0x263a => '0x263A'.

    Args:
      v: code point to convert

    Returns:
      '0x' followed by at least four upper-case hex digits

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    out_of_range = v < 0 or v > _RUNE_MAX
    if out_of_range:
        raise InputError("invalid Unicode value %s" % (v,))
    return "0x%04X" % (v,)
-
-
-def _ParseContinue(s):
- """Parses a Unicode continuation field.
-
- These are of the form '<Name, First>' or '<Name, Last>'.
- Instead of giving an explicit range in a single table entry,
- some Unicode tables use two entries, one for the first
- code value in the range and one for the last.
- The first entry's description is '<Name, First>' instead of 'Name'
- and the second is '<Name, Last>'.
-
- '<Name, First>' => ('Name', 'First')
- '<Name, Last>' => ('Name', 'Last')
- 'Anything else' => ('Anything else', None)
-
- Args:
- s: continuation field string
-
- Returns:
- pair: name and ('First', 'Last', or None)
- """
-
- match = re.match("<(.*), (First|Last)>", s)
- if match is not None:
- return match.groups()
- return (s, None)
-
-
def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    For a range split across a First/Last pair of lines, doline is called
    once, with fields[0] rewritten to 'XXXX..YYYY' and fields[1] set to
    the bare name.  If parsing or doline raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it immediately.

    Arguments:
      filename: the Unicode data file to read (a local path or an
        http:// or https:// URL), or a file-like object.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or the file ends in the middle of a First/Last range.
    """

    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only a handle opened here is closed here; a file-like object passed
    # in by the caller is left open for the caller to manage.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = urllib2.urlopen(filename)
        else:
            fil = open(filename, "r")
    else:
        fil = filename

    try:
        first = None        # first code in multiline range
        expect_last = None  # tag expected for "Last" line in multiline range
        lineno = 0          # current line number
        for line in fil:
            lineno += 1
            try:
                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = range(first, codes[0] + 1)
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the failure happened, then propagate it.
                print("%s:%d: %s" % (filename, lineno, e))
                raise

        if expect_last is not None:
            raise InputError("expected Last line for %s; got EOF" %
                             (expect_last,))
    finally:
        if opened_here:
            fil.close()
-
-
def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns the groups of Unicode code points equivalent under case folding.

    Reads CaseFolding.txt from unicode_dir and uses only the entries whose
    fold type is "C" or "S"; all other entries are ignored.

    Each group is a sorted list of code points, and the list of groups
    is sorted (so primarily by the first code point in each group).

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded
      (lowercase) code point to its group; groups is the sorted list of
      all groups.  Both refer to the same group list objects.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        # A group always contains the folded code point itself.
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

    # Sort every group in place so the lists held by togroup are sorted
    # too, then build the sorted list of groups.  sorted() is used rather
    # than values().sort() because values() is not always a list.
    for g in togroup.values():
        g.sort()
    groups = sorted(togroup.values())
    return togroup, groups
-
-
def Scripts(unicode_dir=_UNICODE_DIR):
    """Builds the script-name => code-point-list table from Scripts.txt.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """

    by_script = {}

    def DoLine(codes, fields):
        """Record the codes of one Scripts.txt entry under its script."""
        _, script = fields
        if script not in by_script:
            by_script[script] = []
        by_script[script].extend(codes)

    table = unicode_dir + "/Scripts.txt"
    ReadUnicodeTable(table, 2, DoLine)
    return by_script
-
-
def Categories(unicode_dir=_UNICODE_DIR):
    """Builds the category-name => code-point-list table from UnicodeData.txt.

    Every entry is recorded under its full category name (third field)
    and, when that name is longer than one character, also under its
    first character, so that e.g. the codes in Lu also appear in L.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """

    by_category = {}

    def DoLine(codes, fields):
        """Record the codes of one UnicodeData.txt entry."""
        full = fields[2]
        keys = [full]
        # Add codes from Lu into L, etc.
        if len(full) > 1:
            keys.append(full[0])
        for key in keys:
            by_category.setdefault(key, []).extend(codes)

    table = unicode_dir + "/UnicodeData.txt"
    ReadUnicodeTable(table, 15, DoLine)
    return by_category
-
« no previous file with comments | « third_party/re2/re2/tostring.cc ('k') | third_party/re2/re2/unicode_casefold.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698