third_party/re2/re2/unicode.py - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/unicode.py

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 # Copyright 2008 The RE2 Authors. All Rights Reserved.

2 # Use of this source code is governed by a BSD-style

3 # license that can be found in the LICENSE file.

4

5 """Parser for Unicode data files (as distributed by unicode.org)."""

6

7 import os

8 import re

9 import urllib2

10

# Directory or URL under which the Unicode data tables are found.
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"

# Largest valid Unicode code point.
_RUNE_MAX = 0x10FFFF

16

17

class Error(Exception):
    """Base class for every error raised by this Unicode table parser."""

20

21

class InputError(Error):
    """Raised when Unicode input data (a value, range or table) is invalid."""

24

25

def _UInt(s):
    """Converts a hex string to a Unicode code point ('263A' => 0x263a).

    Args:
      s: string to convert; must be 4 to 6 hexadecimal digits

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # int(s, 16) on its own is too permissive: it also accepts a sign, a
    # "0x" prefix, surrounding white space and (on Python 3) underscores.
    # Require plain hex digits so such spellings are rejected as documented.
    well_formed = (4 <= len(s) <= 6 and
                   all(c in "0123456789abcdefABCDEF" for c in s))
    if not well_formed:
        raise InputError("invalid Unicode value %s" % (s,))

    v = int(s, 16)
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v

46

47

def _URange(s):
    """Converts a string to a list of Unicode code points.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

    Args:
      s: string to convert; either a single hex value or 'LO..HI'

    Returns:
      list of the code points in the range, in increasing order

    Raises:
      InputError: the string is not a valid Unicode range (this includes
        'LO..HI' where LO is not strictly less than HI).
    """
    parts = s.split("..")
    if len(parts) == 1:
        return [_UInt(parts[0])]
    if len(parts) == 2:
        lo = _UInt(parts[0])
        hi = _UInt(parts[1])
        if lo < hi:
            # Materialize the list so the result has the documented type on
            # Python 3 as well, where range() is lazy.
            return list(range(lo, hi + 1))
    raise InputError("invalid Unicode range %s" % (s,))

72

73

def _UStr(v):
    """Converts a Unicode code point to a hex string.

    0x263a => '0x263A'.

    Args:
      v: code point to convert

    Returns:
      the code point as a '0x'-prefixed upper-case hex string, zero-padded
      to at least four digits

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    if v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (v,))
    return "0x%04X" % (v,)

91

92

def _ParseContinue(s):
    """Parses a Unicode continuation field.

    These are of the form '<Name, First>' or '<Name, Last>'.
    Instead of giving an explicit range in a single table entry,
    some Unicode tables use two entries, one for the first
    code value in the range and one for the last.
    The first entry's description is '<Name, First>' instead of 'Name'
    and the second is '<Name, Last>'.

      '<Name, First>' => ('Name', 'First')
      '<Name, Last>' => ('Name', 'Last')
      'Anything else' => ('Anything else', None)

    Args:
      s: continuation field string

    Returns:
      pair: name and ('First', 'Last', or None)
    """
    # The alternation bar must be unescaped: in Python's re syntax "\|"
    # matches a literal '|' character, which would make this pattern never
    # match a real '<Name, First>' / '<Name, Last>' field.
    match = re.match(r"<(.*), (First|Last)>", s)
    if match is not None:
        return match.groups()
    return (s, None)

118

119

def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    If parsing a line or the doline call raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and then re-raises it; the rest of the file is not processed.

    Arguments:
      filename: the Unicode data file to read (a local path or an
        http:// / https:// URL), or a file-like object yielding lines.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or a '<Name, First>' line is not followed by its 'Last' line.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only close what this function opened; a caller-supplied file-like
    # object stays under the caller's control.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            try:
                from urllib.request import urlopen  # Python 3
            except ImportError:
                from urllib2 import urlopen  # Python 2
            fil = urlopen(filename)
        else:
            fil = open(filename, "r")
    else:
        fil = filename

    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    try:
        for line in fil:
            lineno += 1
            try:
                # URL responses yield bytes on Python 3; on Python 2 bytes
                # is str, so this branch is never taken there.
                if isinstance(line, bytes) and not isinstance(line, str):
                    line = line.decode("utf-8")

                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = list(range(first, codes[0] + 1))
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the failure happened, then propagate it.
                print("%s:%d: %s" % (filename, lineno, e))
                raise
    finally:
        if opened_here:
            fil.close()

    if expect_last is not None:
        raise InputError("expected Last line for %s; got EOF" %
                         (expect_last,))

217

218

def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns the Unicode code groups equivalent under case folding.

    Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.

    Only the common ("C") and simple ("S") foldings listed in
    CaseFolding.txt are used.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded
      (lowercase) code point to its group, and groups is the sorted list
      of those same groups.
    """
    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir + "/CaseFolding.txt", 4, DoLine)

    # Sort each group in place so the lists held by togroup are sorted too,
    # then build the sorted list of groups.  sorted() is used because dict
    # views on Python 3 have no sort() method.
    for group in togroup.values():
        group.sort()
    groups = sorted(togroup.values())
    return togroup, groups

251

252

def Scripts(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping script names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists; each list holds the code
      points in the order they appear in Scripts.txt
    """
    scripts = {}

    def DoLine(codes, fields):
        """Process single Scripts.txt line, updating scripts."""
        (_, name) = fields
        scripts.setdefault(name, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/Scripts.txt", 2, DoLine)
    return scripts

272

273

def Categories(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping category names to code lists.

    Besides each full category read from UnicodeData.txt (e.g. 'Lu'), the
    result also has an entry for its one-letter major class (e.g. 'L')
    collecting the codes of all categories starting with that letter.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """
    categories = {}

    def DoLine(codes, fields):
        """Process single UnicodeData.txt line, updating categories."""
        category = fields[2]
        categories.setdefault(category, []).extend(codes)
        # Add codes from Lu into L, etc.
        if len(category) > 1:
            short = category[0]
            categories.setdefault(short, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/UnicodeData.txt", 15, DoLine)
    return categories

297

OLD	NEW

« no previous file with comments | « third_party/re2/re2/tostring.cc ('k') | third_party/re2/re2/unicode_casefold.h » ('j') | no next file with comments »