trunk/src/third_party/markdown/inlinepatterns.py - Issue 132753002: Revert 243980 "Docserver: Support markdown for HTML content."

Side by Side Diff: trunk/src/third_party/markdown/inlinepatterns.py

Issue 132753002: Revert 243980 "Docserver: Support markdown for HTML content." (Closed) Base URL: svn://svn.chromium.org/chrome/

Patch Set: Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 """

2 INLINE PATTERNS

3 =============================================================================

4

5 Inline patterns such as emphasis are handled by means of auxiliary

6 objects, one per pattern. Pattern objects must be instances of classes

7 that extend markdown.Pattern. Each pattern object uses a single regular

8 expression and needs to support the following methods:

9

10 pattern.getCompiledRegExp() # returns a regular expression

11

12 pattern.handleMatch(m) # takes a match object and returns

13 # an ElementTree element or just plain text

14

15 All of python markdown's built-in patterns subclass from Pattern,

16 but you can add additional patterns that don't.

17

18 Also note that all the regular expressions used by inline must

19 capture the whole block. For this reason, they all start with

20 '^(.*)' and end with '(.*)!'. In case with built-in expression

21 Pattern takes care of adding the "^(.*)" and "(.*)!".

22

23 Finally, the order in which regular expressions are applied is very

24 important - e.g. if we first replace http://.../ links with <a> tags

25 and _then_ try to replace inline html, we would end up with a mess.

26 So, we apply the expressions in the following order:

27

28 * escape and backticks have to go before everything else, so

29 that we can preempt any markdown patterns by escaping them.

30

31 * then we handle auto-links (must be done before inline html)

32

33 * then we handle inline HTML. At this point we will simply

34 replace all inline HTML strings with a placeholder and add

35 the actual HTML to a hash.

36

37 * then inline images (must be done before links)

38

39 * then bracketed links, first regular then reference-style

40

41 * finally we apply strong and emphasis

42 """

43

44 from __future__ import absolute_import

45 from __future__ import unicode_literals

46 from . import util

47 from . import odict

48 import re

49 try:

50 from urllib.parse import urlparse, urlunparse

51 except ImportError:

52 from urlparse import urlparse, urlunparse

53 try:

54 from html import entities

55 except ImportError:

56 import htmlentitydefs as entities

57

58

def build_inlinepatterns(md_instance, **kwargs):
    """
    Assemble the ordered registry of built-in inline patterns.

    The patterns are inserted into an ``odict.OrderedDict`` in the order
    listed below. Two settings of ``md_instance`` influence the result:

    * ``safeMode`` -- the raw "html" pattern is left out when it equals
      ``'escape'``.
    * ``smart_emphasis`` -- selects which regular expression backs the
      "emphasis2" (underscore) pattern.

    ``kwargs`` is accepted but not used here.
    """
    entries = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md_instance)),
        ("reference", ReferencePattern(REFERENCE_RE, md_instance)),
        ("link", LinkPattern(LINK_RE, md_instance)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md_instance)),
        ("image_reference",
         ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md_instance)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md_instance)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md_instance)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ]

    # Raw inline HTML is only recognised outside of "escape" safe mode.
    if md_instance.safeMode != 'escape':
        entries.append(("html", HtmlPattern(HTML_RE, md_instance)))

    entries.extend([
        ("entity", HtmlPattern(ENTITY_RE, md_instance)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'strong,em')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ])

    # Underscore emphasis comes in a "smart" and a plain flavour.
    underscore_re = (SMART_EMPHASIS_RE if md_instance.smart_emphasis
                     else EMPHASIS_2_RE)
    entries.append(("emphasis2", SimpleTagPattern(underscore_re, 'em')))

    registry = odict.OrderedDict()
    for key, inline_pattern in entries:
        registry[key] = inline_pattern
    return registry

86

87 """

88 The actual regular expressions for patterns

89 -----------------------------------------------------------------------------

90 """

91

# NOTE: every expression below is written to be embedded by ``Pattern`` as
# "^(.*?)%s(.*?)$".  The wrapper contributes group 1, so the first group of
# each expression is group 2 at match time -- which is why the
# back-references below say ``\2`` (and ``\12`` in LINK_RE).

# Any run of characters that are not square brackets.
NOBRACKET = r'[^\]\[]*'
# Bracketed text allowing up to six levels of nested square brackets.
BRK = ( r'\[('
        + (NOBRACKET + r'(\[')*6
        + (NOBRACKET+ r'\])*')*6
        + NOBRACKET + r')\]' )
# Negative look-behind: the bracket must not be preceded by "!" (an image).
NOIMG = r'(?<!\!)'

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                             # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_
LINK_RE = NOIMG + BRK + \
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
# [text](url) or [text](<url>) or [text](url "title")

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]'           # [Google][3]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                   # [Google]
# Raw string: the tail contains regex escapes (\s, \[, \]).
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]' # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
LINE_BREAK_RE = r'  \n'                     # two spaces at end of line

122

123

def dequote(string):
    """
    Remove one pair of matching quotes from around a string.

    The string is returned without its first and last character when it is
    at least two characters long and both ends are the same quote character
    (either ``"`` or ``'``). Anything else -- including a lone quote
    character, which is not a quoted string -- is returned unchanged.
    """
    if (len(string) >= 2
            and string[0] == string[-1]
            and string[0] in ('"', "'")):
        return string[1:-1]
    return string

131

# Attribute definition of the form {@name=value}; neither part may contain
# a closing brace.  Raw string so the escaped braces reach the regex engine
# untouched.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}

def handleAttributes(text, parent):
    """
    Set values of an element based on attribute definitions ({@id=123}).

    Every ``{@name=value}`` occurrence in ``text`` is applied to ``parent``
    through ``parent.set(name, value)`` (newlines inside the value are turned
    into spaces) and removed from the text.

    Returns ``text`` with all attribute definitions stripped out.
    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Explicit empty replacement: the definition itself is dropped
        # from the returned text.
        return ''
    return ATTR_RE.sub(attributeCallback, text)

139

140

141 """

142 The pattern classes

143 -----------------------------------------------------------------------------

144 """

145

class Pattern(object):
    """Base class that inline patterns subclass. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional; when truthy it is stored as
          ``self.markdown``. Otherwise the attribute is not set at all.

        """
        self.pattern = pattern
        # Wrap the expression so a match always covers the whole string:
        # group 1 is the text before the pattern, the last group the text
        # after it. The pattern's own groups therefore start at group 2.
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """
        Return unescaped text given text with an inline placeholder.

        Placeholders found by ``util.INLINE_PLACEHOLDER_RE`` are looked up
        in the stash of the 'inline' treeprocessor. A stashed string is
        substituted as-is, any other stashed value is treated as an element
        and replaced by its text content. ``text`` is returned untouched
        when no 'inline' treeprocessor is registered.
        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def itertext(el):
            ' Reimplement Element.itertext for older python versions '
            tag = el.tag
            if not isinstance(tag, util.string_type) and tag is not None:
                return
            if el.text:
                yield el.text
            for e in el:
                for s in itertext(e):
                    yield s
                if e.tail:
                    yield e.tail

        def get_stash(m):
            placeholder_id = m.group(1)
            if placeholder_id in stash:
                value = stash.get(placeholder_id)
                if isinstance(value, util.string_type):
                    return value
                else:
                    # An etree Element - return text content only
                    return ''.join(itertext(value))
            # Unknown ids fall through and return None.

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

215

216

class SimpleTextPattern(Pattern):
    """ Hand back the plain text captured in group(2) of a Pattern. """
    def handleMatch(self, m):
        captured = m.group(2)
        # A capture equal to the inline placeholder prefix yields None
        # instead of text.
        return None if captured == util.INLINE_PLACEHOLDER_PREFIX else captured

224

225

class EscapePattern(Pattern):
    """ Translate a backslash escape into its protected form. """

    def handleMatch(self, m):
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            # Not an escapable character: keep the backslash sequence.
            return '\\%s' % escaped
        # Escapable: emit the character's code point wrapped in STX/ETX.
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)

235

236

class SimpleTagPattern(Pattern):
    """
    Wrap group(3) of a match in a single element.

    The element's tag name is the `tag` given at construction time and
    its text is group(3) of the match.

    """
    def __init__ (self, pattern, tag):
        # No markdown instance is passed on to the base class.
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        node = util.etree.Element(self.tag)
        node.text = m.group(3)
        return node

251

252

class SubstituteTagPattern(SimpleTagPattern):
    """ Replace the match with an empty element of type `tag`. """
    def handleMatch (self, m):
        # The match content is ignored; only the configured tag matters.
        empty_node = util.etree.Element(self.tag)
        return empty_node

257

258

class BacktickPattern(Pattern):
    """ Turn a backtick span into a `<code>` element. """
    def __init__ (self, pattern):
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        # group(3) is the span content; it is stripped of surrounding
        # whitespace and wrapped in util.AtomicString.
        code_text = util.AtomicString(m.group(3).strip())
        node = util.etree.Element(self.tag)
        node.text = code_text
        return node

269

270

class DoubleTagPattern(SimpleTagPattern):
    """Build two nested elements from a comma separated `tag` pair.

    `tag` has the form "outer,inner"; the inner element receives group(3)
    of the match as its text and the outer element is returned.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        return outer

283

284

class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch (self, m):
        """ Stash group(2) of the match and return the stash placeholder. """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """
        Return unescaped text given text with an inline placeholder.

        Unlike the base class, stashed values are run through
        ``self.markdown.serializer``. ``text`` is returned untouched when
        no 'inline' treeprocessor is registered.
        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def get_stash(m):
            stash_id = m.group(1)
            value = stash.get(stash_id)
            if value is not None:
                try:
                    return self.markdown.serializer(value)
                except Exception:
                    # The value could not be serialized; fall back to a
                    # backslash followed by the value itself. Only
                    # ``Exception`` is caught so KeyboardInterrupt and
                    # SystemExit still propagate.
                    return '\\%s' % value
            # Unknown ids fall through and return None.

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

308

309

class LinkPattern(Pattern):
    """ Build an `<a>` element out of an inline link match. """
    def handleMatch(self, m):
        anchor = util.etree.Element("a")
        anchor.text = m.group(2)
        raw_title = m.group(13)
        target = m.group(9)

        if not target:
            anchor.set("href", "")
        else:
            # Drop the first and last character of a target that starts
            # with "<" (the <url> form).
            if target[0] == "<":
                target = target[1:-1]
            cleaned = self.unescape(target.strip())
            anchor.set("href", self.sanitize_url(cleaned))

        if raw_title:
            anchor.set("title", dequote(self.unescape(raw_title)))
        return anchor

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Known-safe url shapes are whitelisted instead of blacklisting
        `javascript:alert("XSS")` and its many aliases (see
        <http://ha.ckers.org/xss.html>). Most urls carry a network location,
        a few legitimately do not (i.e.: mailto links), and script urls
        never do. For `javascript:...` the scheme would be "javascript", but
        some aliases look scheme-less to `urlparse()` -- and so do relative
        links (i.e.: "foo/bar.html"). Hence "path", "parameters", "query"
        and "fragment" are all checked for literal colons. "scheme" is not
        checked because it should never contain one, and "netloc" has to
        allow the form `username:password@host:port`.

        Spaces are always replaced by "%20". Outside of safe mode the url is
        returned right after that; in safe mode a rejected url yields ''.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Not in safe mode: skip parsing altogether.
            return url

        try:
            parsed = urlparse(url)
            scheme, netloc, path, params, query, fragment = parsed
        except ValueError:
            # The url is malformed to the point of being unparsable.
            return ''

        no_location_schemes = ['', 'mailto', 'news']
        permitted_schemes = no_location_schemes + ['http', 'https', 'ftp',
                                                   'ftps']
        if scheme not in permitted_schemes:
            # Unknown scheme, so not considered safe.
            return ''

        if netloc == '' and scheme not in no_location_schemes:
            # A location-based scheme without a location is suspect.
            return ''

        # A colon in "path", "parameters", "query" or "fragment" is suspect.
        if any(":" in part for part in parsed[2:]):
            return ''

        # All checks passed: reassemble the parsed url.
        return urlunparse(parsed)

375

class ImagePattern(LinkPattern):
    """ Build an `<img>` element out of an inline image match. """
    def handleMatch(self, m):
        image = util.etree.Element("img")
        # group(9) holds the source, optionally followed by a title,
        # separated by whitespace.
        tokens = m.group(9).split()
        if not tokens:
            image.set('src', "")
        else:
            source = tokens[0]
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            image.set('src', self.sanitize_url(self.unescape(source)))

        if len(tokens) > 1:
            title_text = " ".join(tokens[1:])
            image.set('title', dequote(self.unescape(title_text)))

        # Attribute definitions inside the alt text are applied to the
        # element (and removed from the text) only when enabled.
        alt_text = m.group(2)
        if self.markdown.enable_attributes:
            alt_text = handleAttributes(alt_text, image)

        image.set('alt', self.unescape(alt_text))
        return image

398

class ReferencePattern(LinkPattern):
    """ Resolve a reference-style link against the stored references. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        label = m.group(2)
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            # The expression has no group 9.
            ref_id = None
        if not ref_id:
            # For something like "[Google][]" or "[Google]" the link text
            # itself ("google") serves as the id.
            ref_id = label.lower()

        # Collapse line breaks inside the id into single spaces.
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.markdown.references:
            # Undefined references are ignored.
            return None

        href, title = self.markdown.references[ref_id]
        return self.makeTag(href, title, label)

    def makeTag(self, href, title, text):
        """ Create the `<a>` element for a resolved reference. """
        anchor = util.etree.Element('a')
        anchor.set('href', self.sanitize_url(href))
        if title:
            anchor.set('title', title)
        anchor.text = text
        return anchor

432

433

class ImageReferencePattern(ReferencePattern):
    """ Resolve a reference-style image and return an `<img>` element. """
    def makeTag(self, href, title, text):
        image = util.etree.Element("img")
        image.set("src", self.sanitize_url(href))
        if title:
            image.set("title", title)

        # When enabled, attribute definitions in the alt text are applied
        # to the element and stripped from the text.
        alt_text = text
        if self.markdown.enable_attributes:
            alt_text = handleAttributes(alt_text, image)

        image.set("alt", self.unescape(alt_text))
        return image

447

448

class AutolinkPattern(Pattern):
    """ Build an `<a>` element for an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        address = m.group(2)
        anchor = util.etree.Element("a")
        # The href is unescaped; the visible text is the raw match wrapped
        # in util.AtomicString.
        anchor.set('href', self.unescape(address))
        anchor.text = util.AtomicString(address)
        return anchor

456

class AutomailPattern(Pattern):
    """
    Build a mailto `<a>` element for an automail link (`<foo@example.com>`).

    Both the visible text and the href are written out character by
    character as entity references prefixed with util.AMP_SUBSTITUTE.
    """
    def handleMatch(self, m):
        anchor = util.etree.Element('a')
        address = self.unescape(m.group(2))
        scheme_prefix = "mailto:"
        if address.startswith(scheme_prefix):
            address = address[len(scheme_prefix):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "%s%s;" % (util.AMP_SUBSTITUTE, name)

        # Visible text: named entities where one exists, numeric otherwise.
        encoded_text = ''.join(codepoint2name(ord(char)) for char in address)
        anchor.text = util.AtomicString(encoded_text)

        # href: always numeric references, including the "mailto:" part.
        encoded_href = "".join(
            [util.AMP_SUBSTITUTE + '#%d;' % ord(char)
             for char in "mailto:" + address])
        anchor.set('href', encoded_href)
        return anchor

483

OLD	NEW

« no previous file with comments | « trunk/src/third_party/markdown/extensions/wikilinks.py ('k') | trunk/src/third_party/markdown/odict.py » ('j') | no next file with comments »