Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(122)

Side by Side Diff: third_party/Python-Markdown/markdown/inlinepatterns.py

Issue 1389543003: Revert of Check in a simple pure-python based Markdown previewer. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@add
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 """
2 INLINE PATTERNS
3 =============================================================================
4
5 Inline patterns such as *emphasis* are handled by means of auxiliary
6 objects, one per pattern. Pattern objects must be instances of classes
7 that extend markdown.Pattern. Each pattern object uses a single regular
8 expression and needs to support the following methods:
9
10 pattern.getCompiledRegExp() # returns a regular expression
11
12 pattern.handleMatch(m) # takes a match object and returns
13 # an ElementTree element or just plain text
14
15 All of python markdown's built-in patterns subclass from Pattern,
16 but you can add additional patterns that don't.
17
18 Also note that all the regular expressions used by inline must
19 capture the whole block. For this reason, they all start with
20 '^(.*?)' and end with '(.*?)$'. For the built-in expressions,
21 Pattern takes care of adding the "^(.*?)" and "(.*?)$".
22
23 Finally, the order in which regular expressions are applied is very
24 important - e.g. if we first replace http://.../ links with <a> tags
25 and _then_ try to replace inline html, we would end up with a mess.
26 So, we apply the expressions in the following order:
27
28 * escape and backticks have to go before everything else, so
29 that we can preempt any markdown patterns by escaping them.
30
31 * then we handle auto-links (must be done before inline html)
32
33 * then we handle inline HTML. At this point we will simply
34 replace all inline HTML strings with a placeholder and add
35 the actual HTML to a hash.
36
37 * then inline images (must be done before links)
38
39 * then bracketed links, first regular then reference-style
40
41 * finally we apply strong and emphasis
42 """
43
44 from __future__ import absolute_import
45 from __future__ import unicode_literals
46 from . import util
47 from . import odict
48 import re
49 try: # pragma: no cover
50 from urllib.parse import urlparse, urlunparse
51 except ImportError: # pragma: no cover
52 from urlparse import urlparse, urlunparse
53 try: # pragma: no cover
54 from html import entities
55 except ImportError: # pragma: no cover
56 import htmlentitydefs as entities
57
58
def build_inlinepatterns(md_instance, **kwargs):
    """Build the default set of inline patterns for Markdown.

    Keyword arguments:

    * md_instance: The Markdown instance the patterns are built for. Its
      ``safeMode`` and ``smart_emphasis`` attributes are consulted here, and
      it is handed to every pattern class that takes a Markdown instance.
    * kwargs: Accepted but not used by this function.

    Returns an ``odict.OrderedDict`` mapping pattern names to pattern
    objects, filled in the order written below.

    """
    patterns = odict.OrderedDict()

    # Code spans and backslash escapes.
    patterns["backtick"] = BacktickPattern(BACKTICK_RE)
    patterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)

    # Links and images, inline and reference style.
    patterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
    patterns["link"] = LinkPattern(LINK_RE, md_instance)
    patterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
    patterns["image_reference"] = ImageReferencePattern(
        IMAGE_REFERENCE_RE, md_instance
    )
    patterns["short_reference"] = ReferencePattern(
        SHORT_REF_RE, md_instance
    )
    patterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
    patterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
    patterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')

    # Raw inline HTML is not registered at all when safeMode is 'escape'.
    if md_instance.safeMode != 'escape':
        patterns["html"] = HtmlPattern(HTML_RE, md_instance)
    patterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)

    # Strong / emphasis handling.
    patterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
    patterns["em_strong"] = DoubleTagPattern(EM_STRONG_RE, 'strong,em')
    patterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'em,strong')
    patterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
    patterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')

    # Underscore emphasis uses the "smart" expression only when enabled.
    underscore_re = (
        SMART_EMPHASIS_RE if md_instance.smart_emphasis else EMPHASIS_2_RE
    )
    patterns["emphasis2"] = SimpleTagPattern(underscore_re, 'em')
    return patterns
89
90 """
91 The actual regular expressions for patterns
92 -----------------------------------------------------------------------------
93 """
94
# NOTE: these expressions are not compiled on their own. `Pattern` wraps each
# one as "^(.*?)%s(.*?)$", so group 1 is always the text before the match and
# a pattern's own groups start at 2. That is why backreferences below say
# `\2` where the expression read in isolation would suggest `\1`.

# Any run of characters that contains no square brackets.
NOBRACKET = r'[^\]\[]*'
# Bracketed text, allowing brackets nested up to six levels deep.
BRK = (
    r'\[(' +
    (NOBRACKET + r'(\[')*6 +
    (NOBRACKET + r'\])*')*6 +
    NOBRACKET + r')\]'
)
# Negative lookbehind: not preceded by `!` (i.e. not an image).
NOIMG = r'(?<!\!)'

# `e=f()` or ``e=f("`")``
BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'

# \<
ESCAPE_RE = r'\\(.)'

# *emphasis*
EMPHASIS_RE = r'(\*)([^\*]+)\2'

# **strong**
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'

# ***strongem*** or ***em*strong**
EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'

# ***strong**em*
STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'

# _smart_emphasis_
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'

# _emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'

# [text](url) or [text](<url>) or [text](url "title")
LINK_RE = NOIMG + BRK + \
    r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^")]+"[^"]*"|[^\)]*))\)'

# [Google][3]
REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'

# [Google]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'

# ![alt text][2]
# The trailing piece must be a raw string: `\s`, `\[` and `\]` are not valid
# escape sequences in an ordinary string literal.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'

# stand-alone * or _
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'

# <http://www.123.com>
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'

# <me@example.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'

# <...>
HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'

# &amp;
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'

# two spaces at end of line
# NOTE(review): the reviewed text showed a single space here, which
# contradicts the comment above and would turn every line ending in one
# space into a <br>; restored to two spaces - confirm against upstream.
LINE_BREAK_RE = r'  \n'
161
162
def dequote(string):
    """Remove quotes from around a string.

    When `string` both starts and ends with a double quote, or both starts
    and ends with a single quote, the first and last characters are dropped.
    Any other string is returned unchanged.
    """
    for quote in ('"', "'"):
        if string.startswith(quote) and string.endswith(quote):
            return string[1:-1]
    return string
170
171
# Matches attribute definitions such as {@id=123}. Written as a raw string:
# `\{` and `\}` are not valid escape sequences in an ordinary literal.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}")


def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Keyword arguments:

    * text: The text to scan for ``{@name=value}`` definitions.
    * parent: The element to update; ``parent.set(name, value)`` is called
      once per definition, with newlines in the value replaced by spaces.

    Returns `text` with every attribute definition removed.

    """
    def attributeCallback(match):
        name, value = match.group(1), match.group(2)
        parent.set(name, value.replace('\n', ' '))
        # Return the replacement explicitly rather than an implicit None.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
180
181
182 """
183 The pattern classes
184 -----------------------------------------------------------------------------
185 """
186
187
class Pattern(object):
    """Base class that inline patterns subclass. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional Markdown instance. It is stored as
          `self.markdown` only when it is truthy.

        """
        self.pattern = pattern
        # Capture the text before and after the match as well, so the
        # pattern's own groups are numbered from 2 in the compiled form.
        wrapped = "^(.*?)%s(.*?)$" % pattern
        self.compiled_re = re.compile(wrapped, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder. """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def itertext(node):  # pragma: no cover
            ' Reimplement Element.itertext for older python versions '
            node_tag = node.tag
            if not isinstance(node_tag, util.string_type) and \
                    node_tag is not None:
                return
            if node.text:
                yield node.text
            for child in node:
                for piece in itertext(child):
                    yield piece
                if child.tail:
                    yield child.tail

        def get_stash(m):
            key = m.group(1)
            if key not in stash:
                # Unknown placeholder: no replacement text is produced.
                return None
            stashed = stash.get(key)
            if isinstance(stashed, util.string_type):
                return stashed
            # An etree Element - return text content only
            return ''.join(itertext(stashed))
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
259
260
class SimpleTextPattern(Pattern):
    """ Return a simple text of group(2) of a Pattern. """

    def handleMatch(self, m):
        """Return the text captured by group 2 of the match `m`."""
        matched_text = m.group(2)
        return matched_text
265
266
class EscapePattern(Pattern):
    """ Return an escaped character. """

    def handleMatch(self, m):
        """Return a placeholder for an escapable character, else None.

        The character in group 2 is looked up in
        `self.markdown.ESCAPED_CHARS`. When present, the result is its code
        point wrapped in `util.STX` / `util.ETX`.
        """
        char = m.group(2)
        if char not in self.markdown.ESCAPED_CHARS:
            return None
        return '%s%s%s' % (util.STX, ord(char), util.ETX)
276
277
class SimpleTagPattern(Pattern):
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        """Compile `pattern` via the base class and remember `tag`."""
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """Return a new `self.tag` element whose text is group 3 of `m`."""
        element = util.etree.Element(self.tag)
        element.text = m.group(3)
        return element
292
293
class SubstituteTagPattern(SimpleTagPattern):
    """ Return an element of type `tag` with no children. """

    def handleMatch(self, m):
        """Return a new, empty `self.tag` element; `m` is not inspected."""
        empty_element = util.etree.Element(self.tag)
        return empty_element
298
299
class BacktickPattern(Pattern):
    """ Return a `<code>` element containing the matching text. """

    def __init__(self, pattern):
        """Compile `pattern` via the base class; the tag is always "code"."""
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        """Return a `self.tag` element holding the stripped text of group 3.

        The text is wrapped in `util.AtomicString`.
        """
        code = util.etree.Element(self.tag)
        stripped = m.group(3).strip()
        code.text = util.AtomicString(stripped)
        return code
310
311
class DoubleTagPattern(SimpleTagPattern):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        """Build `<tag1><tag2>group(3)</tag2>group(4)</tag1>` from `m`.

        `self.tag` holds the two tag names separated by a comma. The tail
        text (group 4) is only attached when the match has exactly five
        groups.
        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        has_tail_group = len(m.groups()) == 5
        if has_tail_group:
            inner.tail = m.group(4)
        return outer
326
327
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m):
        """Stash the raw HTML in group 2 of `m` and return its placeholder.

        The text is passed through `self.unescape` first, then stored with
        `self.markdown.htmlStash.store`, whose return value is returned.
        """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder. """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            value = stash.get(m.group(1))
            if value is None:
                # Nothing stashed under this id: no replacement text.
                return None
            try:
                return self.markdown.serializer(value)
            except Exception:
                # Deliberate best-effort fallback when the stashed value
                # cannot be serialized. `Exception` (not a bare `except`)
                # lets KeyboardInterrupt / SystemExit propagate.
                return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
352
353
class LinkPattern(Pattern):
    """ Return a link element from the given match. """

    def handleMatch(self, m):
        """Return an `<a>` element built from the match `m`.

        Group 2 supplies the link text, group 9 the href and group 13 the
        optional title.
        """
        link = util.etree.Element("a")
        link.text = m.group(2)
        raw_title = m.group(13)
        target = m.group(9)

        if not target:
            link.set("href", "")
        else:
            if target[0] == "<":
                # Drop the first and last character of a <url> target.
                target = target[1:-1]
            cleaned = self.unescape(target.strip())
            link.set("href", self.sanitize_url(cleaned))

        if raw_title:
            link.set("title", dequote(self.unescape(raw_title)))
        return link

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Returns the url, or an empty string when it is rejected.

        """
        if not self.markdown.safeMode:
            # Return immediately bypassing parsing.
            return url

        try:
            parsed = urlparse(url)
        except ValueError:  # pragma: no cover
            # Bad url - so bad it couldn't be parsed.
            return ''
        scheme, netloc = parsed[0], parsed[1]

        locless_schemes = ('', 'mailto', 'news')
        allowed_schemes = locless_schemes + ('http', 'https', 'ftp', 'ftps')
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:  # pragma: no cover
            # This should not happen. Treat as suspect.
            return ''

        # A colon in "path", "parameters", "query"
        # or "fragment" is suspect.
        if any(":" in part for part in parsed[2:]):
            return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(parsed)
419
420
class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """

    def handleMatch(self, m):
        """Return an `<img>` element built from the match `m`.

        Group 9 is split on whitespace: the first piece is the source, any
        remaining pieces are re-joined with single spaces to form the title.
        Group 2 supplies the alt text.
        """
        img = util.etree.Element("img")
        parts = m.group(9).split()
        if not parts:
            img.set('src', "")
        else:
            src = parts[0]
            if src[0] == "<" and src[-1] == ">":
                src = src[1:-1]
            img.set('src', self.sanitize_url(self.unescape(src)))

        title_words = parts[1:]
        if title_words:
            title = dequote(self.unescape(" ".join(title_words)))
            img.set('title', title)

        alt = m.group(2)
        if self.markdown.enable_attributes:
            # {@name=value} definitions are applied to the element and
            # removed from the alt text.
            alt = handleAttributes(alt, img)

        img.set('alt', self.unescape(alt))
        return img
443
444
class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """Return the element for the reference matched by `m`, or None.

        The reference id is the lower-cased group 9 when that group exists
        and is non-empty, otherwise the lower-cased link text (group 2).
        None is returned when the id is not in `self.markdown.references`.
        """
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            ref_id = None
        if not ref_id:
            # if we got something like "[Google][]" or "[Goggle]"
            # we'll use "google" as the id
            ref_id = m.group(2).lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        references = self.markdown.references
        if ref_id not in references:  # ignore undefined refs
            return None
        href, title = references[ref_id]

        return self.makeTag(href, title, m.group(2))

    def makeTag(self, href, title, text):
        """Return an `<a>` element with a sanitized `href`.

        `title` is only set when truthy; `text` becomes the element text.
        """
        anchor = util.etree.Element('a')

        anchor.set('href', self.sanitize_url(href))
        if title:
            anchor.set('title', title)

        anchor.text = text
        return anchor
478
479
class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """

    def makeTag(self, href, title, text):
        """Return an `<img>` element with a sanitized `src`.

        `title` is only set when truthy. `text` becomes the `alt` attribute,
        after attribute definitions have been applied and removed when
        `self.markdown.enable_attributes` is set.
        """
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)

        alt = text
        if self.markdown.enable_attributes:
            alt = handleAttributes(alt, img)

        img.set("alt", self.unescape(alt))
        return img
493
494
class AutolinkPattern(Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """

    def handleMatch(self, m):
        """Return an `<a>` element whose href and text both come from group 2.

        The href is the unescaped group; the text is the group as matched,
        wrapped in `util.AtomicString`.
        """
        url = m.group(2)
        anchor = util.etree.Element("a")
        anchor.set('href', self.unescape(url))
        anchor.text = util.AtomicString(url)
        return anchor
502
503
class AutomailPattern(Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        """Return an `<a>` element with an entity-encoded mailto link.

        Each character of the address becomes a named entity where
        `entities.codepoint2name` has one, otherwise a numeric one, in the
        element text. The href is "mailto:" plus the address, with every
        character numerically encoded.
        """
        anchor = util.etree.Element('a')
        address = self.unescape(m.group(2))
        scheme = "mailto:"
        if address.startswith(scheme):
            address = address[len(scheme):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "%s%s;" % (util.AMP_SUBSTITUTE, name)

        encoded_text = ''.join(codepoint2name(ord(ch)) for ch in address)
        anchor.text = util.AtomicString(encoded_text)

        mailto = "mailto:" + address
        encoded_href = "".join(
            util.AMP_SUBSTITUTE + '#%d;' % ord(ch) for ch in mailto
        )
        anchor.set('href', encoded_href)
        return anchor
OLDNEW
« no previous file with comments | « third_party/Python-Markdown/markdown/extensions/wikilinks.py ('k') | third_party/Python-Markdown/markdown/odict.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698