Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(600)

Side by Side Diff: third_party/markdown/inlinepatterns.py

Issue 93743005: Support markdown template for html editor (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: fix path without dir Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/markdown/extensions/wikilinks.py ('k') | third_party/markdown/odict.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 """
2 INLINE PATTERNS
3 =============================================================================
4
5 Inline patterns such as *emphasis* are handled by means of auxiliary
6 objects, one per pattern. Pattern objects must be instances of classes
7 that extend markdown.Pattern. Each pattern object uses a single regular
8 expression and needs to support the following methods:
9
10 pattern.getCompiledRegExp() # returns a regular expression
11
12 pattern.handleMatch(m) # takes a match object and returns
13 # an ElementTree element or just plain text
14
15 All of python markdown's built-in patterns subclass from Pattern,
16 but you can add additional patterns that don't.
17
18 Also note that all the regular expressions used by inline must
19 capture the whole block. For this reason, they all start with
20 '^(.*?)' and end with '(.*?)$'. In the case of built-in expressions,
21 Pattern takes care of adding the "^(.*?)" and "(.*?)$".
22
23 Finally, the order in which regular expressions are applied is very
24 important - e.g. if we first replace http://.../ links with <a> tags
25 and _then_ try to replace inline html, we would end up with a mess.
26 So, we apply the expressions in the following order:
27
28 * escape and backticks have to go before everything else, so
29 that we can preempt any markdown patterns by escaping them.
30
31 * then we handle auto-links (must be done before inline html)
32
33 * then we handle inline HTML. At this point we will simply
34 replace all inline HTML strings with a placeholder and add
35 the actual HTML to a hash.
36
37 * then inline images (must be done before links)
38
39 * then bracketed links, first regular then reference-style
40
41 * finally we apply strong and emphasis
42 """
43
44 from __future__ import absolute_import
45 from __future__ import unicode_literals
46 from . import util
47 from . import odict
48 import re
49 try:
50 from urllib.parse import urlparse, urlunparse
51 except ImportError:
52 from urlparse import urlparse, urlunparse
53 try:
54 from html import entities
55 except ImportError:
56 import htmlentitydefs as entities
57
58
def build_inlinepatterns(md_instance, **kwargs):
    """Assemble the default inline patterns for Markdown.

    The patterns are returned in an ``odict.OrderedDict`` keyed by pattern
    name; insertion order is the order listed below.  The "html" pattern is
    only registered when ``md_instance.safeMode`` is not ``'escape'``, and
    the underscore emphasis pattern ("emphasis2") uses the smart variant
    when ``md_instance.smart_emphasis`` is true.  ``kwargs`` is accepted
    but not used.
    """
    md = md_instance
    entries = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md)),
        ("reference", ReferencePattern(REFERENCE_RE, md)),
        ("link", LinkPattern(LINK_RE, md)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md)),
        ("image_reference", ImageReferencePattern(IMAGE_REFERENCE_RE, md)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ]

    # Raw inline HTML is only stashed when it is not going to be escaped.
    if md.safeMode != 'escape':
        entries.append(("html", HtmlPattern(HTML_RE, md)))

    entries.extend([
        ("entity", HtmlPattern(ENTITY_RE, md)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'strong,em')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ])

    underscore_re = SMART_EMPHASIS_RE if md.smart_emphasis else EMPHASIS_2_RE
    entries.append(("emphasis2", SimpleTagPattern(underscore_re, 'em')))

    patterns = odict.OrderedDict()
    for name, pattern in entries:
        patterns[name] = pattern
    return patterns
86
87 """
88 The actual regular expressions for patterns
89 -----------------------------------------------------------------------------
90 """
91
# NOTE: these expressions are written to be embedded by `Pattern`, which
# prepends one capturing group ("^(.*?)") before compiling.  Numeric
# backreferences such as \2 and \12 therefore count that extra group.

# Any run of characters that contains no square bracket.
NOBRACKET = r'[^\]\[]*'
# Bracketed text; the repeated fragments allow up to six nested bracket pairs.
BRK = ( r'\[('
        + (NOBRACKET + r'(\[')*6
        + (NOBRACKET+ r'\])*')*6
        + NOBRACKET + r')\]' )
# Look-behind asserting that the preceding character is not "!".
NOIMG = r'(?<!\!)'

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                             # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_
LINK_RE = NOIMG + BRK + \
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
# [text](url) or [text](<url>) or [text](url "title")

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]'           # [Google][3]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                   # [Google]
# The trailing fragment is now a raw string: "\s" and "\[" are invalid
# escape sequences in a plain literal.  The resulting value is unchanged.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]' # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
# Two literal spaces followed by a newline, as the trailing comment states.
LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
122
123
def dequote(string):
    """Remove one pair of matching quotes from around a string.

    The string is returned without its first and last characters when both
    are the same quote character (``"`` or ``'``).  Anything else, including
    the empty string and a string consisting of a single quote character,
    is returned unchanged: one character cannot be both the opening and the
    closing quote.
    """
    if (len(string) >= 2
            and string[0] == string[-1]
            and string[0] in ('"', "'")):
        return string[1:-1]
    return string
131
# Raw string: "\{" and "\}" are not valid escapes in a plain string literal.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}

def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Every ``{@name=value}`` definition found in `text` is applied to
    `parent` through ``parent.set(name, value)``, with newlines in the value
    replaced by spaces.  Returns `text` with all definitions removed.
    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Explicitly replace the definition with nothing.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
139
140
141 """
142 The pattern classes
143 -----------------------------------------------------------------------------
144 """
145
class Pattern(object):
    """Base class that inline patterns subclass. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional Markdown instance; stored as
          `self.markdown` only when a truthy value is given.

        """
        self.pattern = pattern
        # Wrap the expression so a match always spans the whole text:
        # group 1 is what precedes the pattern, the last group what follows.
        flags = re.DOTALL | re.UNICODE
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, flags)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return the compiled (wrapped) regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return an ElementTree element built from the given match.

        The base implementation does nothing and returns None; subclasses
        should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        return None

    def type(self):
        """ Return the class name, used to identify the pattern type. """
        return type(self).__name__

    def unescape(self, text):
        """ Return `text` with inline placeholders replaced by stashed text.

        If no 'inline' treeprocessor is registered, `text` is returned
        untouched.  A placeholder whose id is not in the stash is removed.
        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def text_of(node):
            ' Reimplement Element.itertext for older python versions '
            tag = node.tag
            if tag is not None and not isinstance(tag, util.string_type):
                return
            if node.text:
                yield node.text
            for child in node:
                for piece in text_of(child):
                    yield piece
                if child.tail:
                    yield child.tail

        def restore(match):
            key = match.group(1)
            if key not in stash:
                return None
            stashed = stash.get(key)
            if isinstance(stashed, util.string_type):
                return stashed
            # An etree Element - return text content only
            return ''.join(text_of(stashed))

        return util.INLINE_PLACEHOLDER_RE.sub(restore, text)
215
216
class SimpleTextPattern(Pattern):
    """ Return group(2) of the match as plain text.

    A match consisting solely of the inline placeholder prefix yields None.
    """
    def handleMatch(self, m):
        matched = m.group(2)
        return None if matched == util.INLINE_PLACEHOLDER_PREFIX else matched
224
225
class EscapePattern(Pattern):
    """ Handle a backslash-escaped character. """

    def handleMatch(self, m):
        """Encode an escapable character, or give back the escape verbatim.

        Characters listed in `self.markdown.ESCAPED_CHARS` are replaced by
        their code point wrapped in `util.STX` / `util.ETX`; any other
        character is returned with its backslash restored.
        """
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            return '\\%s' % escaped
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)
235
236
class SimpleTagPattern(Pattern):
    """
    Build an element of type `tag` whose text is group(3) of the match.

    """
    def __init__(self, pattern, tag):
        # No markdown instance is passed on to the base class.
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        element = util.etree.Element(self.tag)
        element.text = m.group(3)
        return element
251
252
class SubstituteTagPattern(SimpleTagPattern):
    """ Produce an empty element of type `tag`, ignoring the matched text. """
    def handleMatch(self, m):
        replacement = util.etree.Element(self.tag)
        return replacement
257
258
class BacktickPattern(Pattern):
    """ Return a `<code>` element containing the matching text. """
    def __init__(self, pattern):
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        # The code text is stripped of surrounding whitespace and wrapped in
        # util.AtomicString.
        code_text = m.group(3).strip()
        element = util.etree.Element(self.tag)
        element.text = util.AtomicString(code_text)
        return element
269
270
class DoubleTagPattern(SimpleTagPattern):
    """Return an ElementTree element of two nested tags.

    `self.tag` holds both tag names separated by a comma ("outer,inner");
    the matched text (group 3) becomes the text of the inner element.
    Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        return outer
283
284
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m):
        """Stash the matched html (group 2) and return its placeholder."""
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder is replaced by the serialized form of its stashed
        node.  If no 'inline' treeprocessor is registered, `text` is
        returned untouched; a placeholder with nothing stashed is removed.
        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def get_stash(m):
            value = stash.get(m.group(1))
            if value is None:
                return ''
            try:
                return self.markdown.serializer(value)
            except Exception:
                # Best-effort fallback when the stashed value cannot be
                # serialized.  `Exception` rather than a bare `except` so
                # that KeyboardInterrupt/SystemExit are not swallowed.
                return '\\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
308
309
class LinkPattern(Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        """Build an `<a>` element from a `[text](url "title")` match.

        Group 2 is the link text, group 9 the url and group 13 the optional
        title.  A missing url yields an empty `href`.
        """
        el = util.etree.Element("a")
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            # Only strip angle brackets when the url is really wrapped in
            # them (same check as ImagePattern); otherwise a url that merely
            # starts with "<" would lose its last character.
            if href[0] == "<" and href[-1] == ">":
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        if title:
            title = dequote(self.unescape(title))
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Spaces are always replaced by "%20".  Outside of safe mode the url
        is otherwise returned unchecked; in safe mode a url that fails any
        check is replaced by the empty string.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Return immediately, bypassing parsing.
            return url

        try:
            parsed = urlparse(url)
        except ValueError:
            # Bad url - so bad it couldn't be parsed.
            return ''
        scheme, netloc = parsed[0], parsed[1]

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:
            # This should not happen. Treat as suspect.
            return ''

        # parsed[2:] is (path, params, query, fragment).
        if any(":" in part for part in parsed[2:]):
            # A colon in "path", "parameters", "query" or "fragment" is
            # suspect.
            return ''

        # Url passes all tests. Return the re-assembled url.
        return urlunparse(parsed)
375
class ImagePattern(LinkPattern):
    """ Return an img element from the given match. """
    def handleMatch(self, m):
        """Build an `<img>` element from a `![alt](src "title")` match.

        Group 9 is split on whitespace: the first word is the source (angle
        brackets removed) and any remaining words form the title.  Group 2
        is the alt text; when `self.markdown.enable_attributes` is set,
        `{@name=value}` definitions in it are applied to the element first.
        """
        img = util.etree.Element("img")
        parts = m.group(9).split()

        if not parts:
            img.set('src', "")
        else:
            source = parts[0]
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            img.set('src', self.sanitize_url(self.unescape(source)))

        title_words = parts[1:]
        if title_words:
            img.set('title', dequote(self.unescape(" ".join(title_words))))

        alt = m.group(2)
        if self.markdown.enable_attributes:
            alt = handleAttributes(alt, img)

        img.set('alt', self.unescape(alt))
        return img
398
class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """Look up the reference named by the match and build its tag.

        The id comes from group 9, lower-cased.  When that group does not
        exist or is empty (e.g. "[Google][]" or "[Google]"), the link text
        in group 2 is used as the id instead.  Returns None for undefined
        references.
        """
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            ref_id = None
        if not ref_id:
            ref_id = m.group(2).lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)

        references = self.markdown.references
        if ref_id in references:
            href, title = references[ref_id]
            return self.makeTag(href, title, m.group(2))
        # ignore undefined refs
        return None

    def makeTag(self, href, title, text):
        """Return an `<a>` element with a sanitized href and optional title."""
        link = util.etree.Element('a')
        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)
        link.text = text
        return link
432
433
class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """Return an `<img>` element; `text` becomes its alt attribute."""
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)

        alt = text
        if self.markdown.enable_attributes:
            # Apply any {@name=value} definitions and drop them from the alt.
            alt = handleAttributes(alt, img)

        img.set("alt", self.unescape(alt))
        return img
447
448
class AutolinkPattern(Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        # The url (group 2) is used both as the href (unescaped) and as the
        # visible link text (wrapped in util.AtomicString).
        url = m.group(2)
        link = util.etree.Element("a")
        link.set('href', self.unescape(url))
        link.text = util.AtomicString(url)
        return link
456
class AutomailPattern(Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        """Build an `<a>` element whose text and href are entity-encoded.

        Every character of the address is written as a character reference
        prefixed with `util.AMP_SUBSTITUTE`: named where a name exists
        (link text only), numeric otherwise.  The href is always numeric.
        """
        prefix = "mailto:"
        address = self.unescape(m.group(2))
        if address.startswith(prefix):
            address = address[len(prefix):]

        def as_entity(char):
            """Return the named entity for `char`, or a numeric one."""
            code = ord(char)
            name = entities.codepoint2name.get(code)
            if name:
                return "%s%s;" % (util.AMP_SUBSTITUTE, name)
            return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        link = util.etree.Element('a')
        link.text = util.AtomicString(
            ''.join([as_entity(char) for char in address]))

        encoded_href = "".join(
            [util.AMP_SUBSTITUTE + '#%d;' % ord(char)
             for char in prefix + address])
        link.set('href', encoded_href)
        return link
483
OLDNEW
« no previous file with comments | « third_party/markdown/extensions/wikilinks.py ('k') | third_party/markdown/odict.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698