third_party/Python-Markdown/markdown/inlinepatterns.py - Issue 1389543003: Revert of Check in a simple pure-python based Markdown previewer.

Side by Side Diff: third_party/Python-Markdown/markdown/inlinepatterns.py

Issue 1389543003: Revert of Check in a simple pure-python based Markdown previewer. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@add

Patch Set: Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 """

2 INLINE PATTERNS

3 =============================================================================

4

5 Inline patterns such as emphasis are handled by means of auxiliary

6 objects, one per pattern. Pattern objects must be instances of classes

7 that extend markdown.Pattern. Each pattern object uses a single regular

8 expression and needs to support the following methods:

9

10 pattern.getCompiledRegExp() # returns a regular expression

11

12 pattern.handleMatch(m) # takes a match object and returns

13 # an ElementTree element or just plain text

14

15 All of python markdown's built-in patterns subclass from Pattern,

16 but you can add additional patterns that don't.

17

18 Also note that all the regular expressions used by inline must

19 capture the whole block. For this reason, they all start with

20 '^(.*)' and end with '(.*)!'. In the case of built-in expressions,

21 Pattern takes care of adding the "^(.*)" and "(.*)!".

22

23 Finally, the order in which regular expressions are applied is very

24 important - e.g. if we first replace http://.../ links with <a> tags

25 and _then_ try to replace inline html, we would end up with a mess.

26 So, we apply the expressions in the following order:

27

28 * escape and backticks have to go before everything else, so

29 that we can preempt any markdown patterns by escaping them.

30

31 * then we handle auto-links (must be done before inline html)

32

33 * then we handle inline HTML. At this point we will simply

34 replace all inline HTML strings with a placeholder and add

35 the actual HTML to a hash.

36

37 * then inline images (must be done before links)

38

39 * then bracketed links, first regular then reference-style

40

41 * finally we apply strong and emphasis

42 """

43

44 from __future__ import absolute_import

45 from __future__ import unicode_literals

46 from . import util

47 from . import odict

48 import re

49 try: # pragma: no cover

50 from urllib.parse import urlparse, urlunparse

51 except ImportError: # pragma: no cover

52 from urlparse import urlparse, urlunparse

53 try: # pragma: no cover

54 from html import entities

55 except ImportError: # pragma: no cover

56 import htmlentitydefs as entities

57

58

def build_inlinepatterns(md_instance, **kwargs):
    """ Build the default set of inline patterns for Markdown.

    Returns an ordered mapping of pattern name to pattern object. The
    insertion order below is the order in which the names are registered.

    Keyword arguments:

    * md_instance: The Markdown instance; its `safeMode` and
      `smart_emphasis` attributes select which patterns are registered.
    * kwargs: Accepted but not used here.

    """
    patterns = odict.OrderedDict()

    for name, pattern in (
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md_instance)),
        ("reference", ReferencePattern(REFERENCE_RE, md_instance)),
        ("link", LinkPattern(LINK_RE, md_instance)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md_instance)),
        ("image_reference",
         ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md_instance)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md_instance)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md_instance)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ):
        patterns[name] = pattern

    # Raw inline HTML is only registered when safe mode is not 'escape'.
    if md_instance.safeMode != 'escape':
        patterns["html"] = HtmlPattern(HTML_RE, md_instance)

    for name, pattern in (
        ("entity", HtmlPattern(ENTITY_RE, md_instance)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("em_strong", DoubleTagPattern(EM_STRONG_RE, 'strong,em')),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'em,strong')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ):
        patterns[name] = pattern

    # Underscore emphasis comes in a "smart" and a plain flavour.
    if md_instance.smart_emphasis:
        underscore_re = SMART_EMPHASIS_RE
    else:
        underscore_re = EMPHASIS_2_RE
    patterns["emphasis2"] = SimpleTagPattern(underscore_re, 'em')

    return patterns

89

90 """

91 The actual regular expressions for patterns

92 -----------------------------------------------------------------------------

93 """

94

# NOTE: the back-references in these expressions (\2, \12) are numbered for
# the regex that Pattern.__init__ builds by wrapping each expression between
# a leading and a trailing capture group, so the first group written here is
# group 2 of the compiled regex.

# Any run of characters that are not square brackets.
NOBRACKET = r'[^\]\[]*'
# Bracketed text, allowing up to six levels of nested [brackets].
BRK = (
    r'\[(' +
    (NOBRACKET + r'(\[') * 6 +
    (NOBRACKET + r'\])*') * 6 +
    NOBRACKET + r')\]'
)
# Negative lookbehind: not preceded by "!" (i.e. not an image).
NOIMG = r'(?<!\!)'

# `e=f()` or ``e=f("`")``
BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'

# \<
ESCAPE_RE = r'\\(.)'

# *emphasis*
EMPHASIS_RE = r'(\*)([^\*]+)\2'

# **strong**
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'

# ***strongem*** or ***em*strong**
EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'

# ***strong**em*
STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'

# _smart_emphasis_
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'

# _emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'

# [text](url) or [text](<url>) or [text](url "title")
LINK_RE = NOIMG + BRK + \
    r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^")]+"[^"]*"|[^\)]*))\)'

# [Google][3]
REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'

# [Google]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'

# ![alt text][2]
# The trailing literal is raw so that "\s", "\[" and "\]" are not treated as
# (invalid) string escape sequences; the resulting value is unchanged.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'

# stand-alone * or _
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'

# <http://www.123.com>
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'

# <me@example.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'

# <...>
HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'

# &amp;
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'

# two spaces at end of line
LINE_BREAK_RE = r'  \n'

161

162

def dequote(string):
    """Remove one pair of matching quotes from around a string.

    The string is returned without its first and last character when both
    are the same quote character (either `"` or `'`). Any other string,
    including the empty string and a lone quote character, is returned
    unchanged.

    """
    # Require two characters so that a lone quote is not treated as both the
    # opening and the closing quote.
    if (len(string) >= 2 and string[0] == string[-1] and
            string[0] in ('"', "'")):
        return string[1:-1]
    return string

170

171

# {@id=123}
# Raw literal so that "\{" and "\}" reach the regex engine without relying on
# invalid string escape sequences.
ATTR_RE = re.compile(r'\{@([^\}]*)=([^\}]*)}')


def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Every `{@name=value}` definition found in `text` is applied to `parent`
    through `parent.set(name, value)`, with newlines in the value replaced by
    spaces.

    Keyword arguments:

    * text: The string to scan for attribute definitions.
    * parent: Object exposing a `set(key, value)` method.

    Returns `text` with all attribute definitions removed.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # The definition itself is dropped from the text.
        return ''
    return ATTR_RE.sub(attributeCallback, text)

180

181

182 """

183 The pattern classes

184 -----------------------------------------------------------------------------

185 """

186

187

class Pattern(object):
    """Base class that inline patterns subclass. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional; when truthy it is stored as
          `self.markdown` (the attribute is not set otherwise).

        """
        self.pattern = pattern
        # Wrap the expression so that a match always spans the whole text:
        # group 1 is everything before the pattern, the last group is
        # everything after it, and the pattern's own groups start at 2.
        self.compiled_re = re.compile("^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder matched by `util.INLINE_PLACEHOLDER_RE` is replaced
        by its stashed value: a stashed string is used as-is, any other
        stashed value is reduced to its text content, and a placeholder whose
        id is not in the stash is replaced by the empty string. `text` is
        returned unchanged when no 'inline' treeprocessor is registered.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def itertext(el):  # pragma: no cover
            ' Reimplement Element.itertext for older python versions '
            tag = el.tag
            if not isinstance(tag, util.string_type) and tag is not None:
                return
            if el.text:
                yield el.text
            for e in el:
                for s in itertext(e):
                    yield s
                if e.tail:
                    yield e.tail

        def get_stash(m):
            placeholder_id = m.group(1)
            if placeholder_id not in stash:
                # Unknown placeholder: substitute nothing.
                return ''
            value = stash.get(placeholder_id)
            if isinstance(value, util.string_type):
                return value
            # An etree Element - return text content only
            return ''.join(itertext(value))
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

259

260

class SimpleTextPattern(Pattern):
    """ Pattern whose result is the plain text captured by group(2). """

    def handleMatch(self, m):
        """ Return the text of group(2) of the match `m`. """
        matched_text = m.group(2)
        return matched_text

265

266

class EscapePattern(Pattern):
    """ Return an escaped character. """

    def handleMatch(self, m):
        """
        Return the code point of the character in group(2), wrapped between
        `util.STX` and `util.ETX`, or None when that character is not listed
        in the Markdown instance's `ESCAPED_CHARS`.

        """
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            return None
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)

276

277

class SimpleTagPattern(Pattern):
    """
    Pattern producing an element named `tag` whose text is group(3) of the
    match.

    """

    def __init__(self, pattern, tag):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * tag: Name of the element created for each match

        """
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """ Return a new `self.tag` element holding the text of group(3). """
        element = util.etree.Element(self.tag)
        element.text = m.group(3)
        return element

292

293

class SubstituteTagPattern(SimpleTagPattern):
    """ Pattern producing an empty element of type `tag`. """

    def handleMatch(self, m):
        """ Return a new `self.tag` element; the match content is unused. """
        empty_element = util.etree.Element(self.tag)
        return empty_element

298

299

class BacktickPattern(Pattern):
    """ Return a `<code>` element containing the matching text. """

    def __init__(self, pattern):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern

        """
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        """
        Return a `self.tag` element whose text is group(3) with surrounding
        whitespace stripped, wrapped in `util.AtomicString`.

        """
        code_text = m.group(3).strip()
        element = util.etree.Element(self.tag)
        element.text = util.AtomicString(code_text)
        return element

310

311

class DoubleTagPattern(SimpleTagPattern):
    """Return an ElementTree element with one tag nested inside another.

    `self.tag` holds both names separated by a comma ("outer,inner").
    Useful for strong emphasis etc.

    """

    def handleMatch(self, m):
        """
        Build `<outer><inner>group(3)</inner></outer>` and return the outer
        element. When the match has exactly five groups, group(4) becomes
        the tail text of the inner element.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        has_tail_group = len(m.groups()) == 5
        if has_tail_group:
            inner.tail = m.group(4)
        return outer

326

327

class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m):
        """
        Unescape group(2) of the match, store the result in the Markdown
        instance's `htmlStash` and return whatever `store()` returns.

        """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder matched by `util.INLINE_PLACEHOLDER_RE` is replaced
        by the serialized form of its stashed value. If serializing fails,
        the value is formatted with `%s` and prefixed with a backslash. A
        placeholder with no stashed value is replaced by the empty string.
        `text` is returned unchanged when no 'inline' treeprocessor is
        registered.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            value = stash.get(m.group(1))
            if value is None:
                # Nothing stashed under this id: substitute nothing.
                return ''
            try:
                return self.markdown.serializer(value)
            except Exception:
                # Best-effort fallback. `Exception` (rather than a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                return '\\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

352

353

class LinkPattern(Pattern):
    """ Return a link element from the given match. """

    def handleMatch(self, m):
        """
        Build an `<a>` element: text from group(2), `href` from group(9)
        and, when present, `title` from group(13).

        """
        link = util.etree.Element("a")
        link.text = m.group(2)
        raw_title = m.group(13)
        raw_href = m.group(9)

        if not raw_href:
            link.set("href", "")
        else:
            # Drop the surrounding angle brackets of a <url>.
            if raw_href.startswith("<"):
                raw_href = raw_href[1:-1]
            cleaned = self.unescape(raw_href.strip())
            link.set("href", self.sanitize_url(cleaned))

        if raw_title:
            link.set("title", dequote(self.unescape(raw_title)))
        return link

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Instead of blacklisting `javascript:alert("XSS")` and every alias of
        it (see <http://ha.ckers.org/xss.html>), only known-safe url formats
        are let through. Most urls carry a network location, but a few are
        known not to (i.e.: mailto links), and script urls never do. For
        `javascript:...` the scheme would be "javascript", yet some aliases
        look scheme-less to `urlparse()`, just like relative links (i.e.:
        "foo/bar.html"). For that reason "path", "parameters", "query" and
        "fragment" are all checked for literal colons. "scheme" is not
        checked because it should never contain one, and "netloc" has to
        allow the form `username:password@host:port`.

        Returns the url, or an empty string when it is rejected.

        """
        if not self.markdown.safeMode:
            # Not in safe mode: skip parsing and return the url untouched.
            return url

        try:
            parsed = urlparse(url)
            scheme, netloc = parsed[0], parsed[1]
        except ValueError:  # pragma: no cover
            # Bad url - so bad it couldn't be parsed.
            return ''

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']

        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:  # pragma: no cover
            # This should not happen. Treat as suspect.
            return ''

        # A colon in "path", "parameters", "query" or "fragment" is suspect.
        if any(":" in part for part in parsed[2:]):
            return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(parsed)

419

420

class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """

    def handleMatch(self, m):
        """
        Build an `<img>` element. Group(9) is split on whitespace: the first
        piece is the source, any remaining pieces joined by single spaces
        form the title. Group(2) supplies the alt text.

        """
        image = util.etree.Element("img")
        parts = m.group(9).split()

        if not parts:
            image.set('src', "")
        else:
            source = parts[0]
            # Drop the surrounding angle brackets of a <url>.
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            image.set('src', self.sanitize_url(self.unescape(source)))

        if len(parts) > 1:
            raw_title = " ".join(parts[1:])
            image.set('title', dequote(self.unescape(raw_title)))

        alt_text = m.group(2)
        if self.markdown.enable_attributes:
            # Attribute definitions are applied to the element and removed
            # from the alt text.
            alt_text = handleAttributes(alt_text, image)

        image.set('alt', self.unescape(alt_text))
        return image

443

444

class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """
        Look up the reference named by the match in
        `self.markdown.references` and return the tag built by `makeTag`,
        or None when the reference is not defined.

        """
        link_text = m.group(2)

        try:
            ref_id = m.group(9).lower()
        except IndexError:
            # The pattern has no group 9.
            ref_id = None
        if not ref_id:
            # For something like "[Google][]" or "[Google]" the link text
            # itself ("google") serves as the id.
            ref_id = link_text.lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)

        references = self.markdown.references
        if ref_id not in references:  # ignore undefined refs
            return None
        href, title = references[ref_id]

        return self.makeTag(href, title, link_text)

    def makeTag(self, href, title, text):
        """
        Return an `<a>` element with a sanitized `href`, an optional `title`
        and `text` as its content.

        """
        link = util.etree.Element('a')
        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)
        link.text = text
        return link

478

479

class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """

    def makeTag(self, href, title, text):
        """
        Return an `<img>` element with a sanitized `src`, an optional
        `title` and `text` (unescaped) as its `alt` attribute.

        """
        image = util.etree.Element("img")
        image.set("src", self.sanitize_url(href))
        if title:
            image.set("title", title)

        alt_text = text
        if self.markdown.enable_attributes:
            # Attribute definitions are applied to the element and removed
            # from the alt text.
            alt_text = handleAttributes(alt_text, image)

        image.set("alt", self.unescape(alt_text))
        return image

493

494

class AutolinkPattern(Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """

    def handleMatch(self, m):
        """
        Return an `<a>` element whose `href` is the unescaped group(2) and
        whose text is group(2) wrapped in `util.AtomicString`.

        """
        target = m.group(2)
        link = util.etree.Element("a")
        link.set('href', self.unescape(target))
        link.text = util.AtomicString(target)
        return link

502

503

class AutomailPattern(Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """

    def handleMatch(self, m):
        """
        Return an `<a>` element for the address in group(2). Both the link
        text and the `href` are written as character entities, each
        introduced by `util.AMP_SUBSTITUTE`.

        """
        link = util.etree.Element('a')
        address = self.unescape(m.group(2))
        scheme_prefix = "mailto:"
        if address.startswith(scheme_prefix):
            address = address[len(scheme_prefix):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "%s%s;" % (util.AMP_SUBSTITUTE, name)

        # Link text: named entities where one exists, numeric otherwise.
        encoded_text = ''.join(codepoint2name(ord(ch)) for ch in address)
        link.text = util.AtomicString(encoded_text)

        # href: every character of "mailto:<address>" as a numeric entity.
        encoded_href = "".join(
            util.AMP_SUBSTITUTE + '#%d;' % ord(ch)
            for ch in "mailto:" + address
        )
        link.set('href', encoded_href)
        return link

OLD	NEW

« no previous file with comments | « third_party/Python-Markdown/markdown/extensions/wikilinks.py ('k') | third_party/Python-Markdown/markdown/odict.py » ('j') | no next file with comments »