Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(58)

Side by Side Diff: third_party/Python-Markdown/markdown/inlinepatterns.py

Issue 1356203004: Check in a simple pure-python based Markdown previewer. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@add
Patch Set: fix license file Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # markdown is released under the BSD license
2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4 # Copyright 2004 Manfred Stienstra (the original version)
5 #
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions are met:
10 #
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # * Neither the name of the <organization> nor the
17 # names of its contributors may be used to endorse or promote products
18 # derived from this software without specific prior written permission.
19 #
20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
31
32
33 """ 1 """
34 INLINE PATTERNS 2 INLINE PATTERNS
35 ============================================================================= 3 =============================================================================
36 4
37 Inline patterns such as *emphasis* are handled by means of auxiliary 5 Inline patterns such as *emphasis* are handled by means of auxiliary
38 objects, one per pattern. Pattern objects must be instances of classes 6 objects, one per pattern. Pattern objects must be instances of classes
39 that extend markdown.Pattern. Each pattern object uses a single regular 7 that extend markdown.Pattern. Each pattern object uses a single regular
40 expression and needs support the following methods: 8 expression and needs support the following methods:
41 9
42 pattern.getCompiledRegExp() # returns a regular expression 10 pattern.getCompiledRegExp() # returns a regular expression
(...skipping 28 matching lines...) Expand all
71 * then bracketed links, first regular then reference-style 39 * then bracketed links, first regular then reference-style
72 40
73 * finally we apply strong and emphasis 41 * finally we apply strong and emphasis
74 """ 42 """
75 43
76 from __future__ import absolute_import 44 from __future__ import absolute_import
77 from __future__ import unicode_literals 45 from __future__ import unicode_literals
78 from . import util 46 from . import util
79 from . import odict 47 from . import odict
80 import re 48 import re
81 try: 49 try: # pragma: no cover
82 from urllib.parse import urlparse, urlunparse 50 from urllib.parse import urlparse, urlunparse
83 except ImportError: 51 except ImportError: # pragma: no cover
84 from urlparse import urlparse, urlunparse 52 from urlparse import urlparse, urlunparse
85 try: 53 try: # pragma: no cover
86 from html import entities 54 from html import entities
87 except ImportError: 55 except ImportError: # pragma: no cover
88 import htmlentitydefs as entities 56 import htmlentitydefs as entities
89 57
90 58
91 def build_inlinepatterns(md_instance, **kwargs): 59 def build_inlinepatterns(md_instance, **kwargs):
92 """ Build the default set of inline patterns for Markdown. """ 60 """ Build the default set of inline patterns for Markdown. """
93 inlinePatterns = odict.OrderedDict() 61 inlinePatterns = odict.OrderedDict()
94 inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE) 62 inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
95 inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance) 63 inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
96 inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance) 64 inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
97 inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance) 65 inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
98 inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance) 66 inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
99 inlinePatterns["image_reference"] = \ 67 inlinePatterns["image_reference"] = ImageReferencePattern(
100 ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance) 68 IMAGE_REFERENCE_RE, md_instance
101 inlinePatterns["short_reference"] = \ 69 )
102 ReferencePattern(SHORT_REF_RE, md_instance) 70 inlinePatterns["short_reference"] = ReferencePattern(
71 SHORT_REF_RE, md_instance
72 )
103 inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance) 73 inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
104 inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance) 74 inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
105 inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br') 75 inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
106 if md_instance.safeMode != 'escape': 76 if md_instance.safeMode != 'escape':
107 inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance) 77 inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
108 inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance) 78 inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
109 inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE) 79 inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
110 inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em') 80 inlinePatterns["em_strong"] = DoubleTagPattern(EM_STRONG_RE, 'strong,em')
81 inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'em,strong')
111 inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong') 82 inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
112 inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em') 83 inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
113 if md_instance.smart_emphasis: 84 if md_instance.smart_emphasis:
114 inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em') 85 inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
115 else: 86 else:
116 inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em') 87 inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
117 return inlinePatterns 88 return inlinePatterns
118 89
119 """ 90 """
120 The actual regular expressions for patterns 91 The actual regular expressions for patterns
121 ----------------------------------------------------------------------------- 92 -----------------------------------------------------------------------------
122 """ 93 """
123 94
124 NOBRACKET = r'[^\]\[]*' 95 NOBRACKET = r'[^\]\[]*'
125 BRK = ( r'\[(' 96 BRK = (
126 + (NOBRACKET + r'(\[')*6 97 r'\[(' +
127 + (NOBRACKET+ r'\])*')*6 98 (NOBRACKET + r'(\[')*6 +
128 + NOBRACKET + r')\]' ) 99 (NOBRACKET + r'\])*')*6 +
100 NOBRACKET + r')\]'
101 )
129 NOIMG = r'(?<!\!)' 102 NOIMG = r'(?<!\!)'
130 103
131 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")`` 104 # `e=f()` or ``e=f("`")``
132 ESCAPE_RE = r'\\(.)' # \< 105 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'
133 EMPHASIS_RE = r'(\*)([^\*]+)\2' # *emphasis* 106
134 STRONG_RE = r'(\*{2}|_{2})(.+?)\2' # **strong** 107 # \<
135 STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2' # ***strong*** 108 ESCAPE_RE = r'\\(.)'
136 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)' # _smart_emphasis_ 109
137 EMPHASIS_2_RE = r'(_)(.+?)\2' # _emphasis_ 110 # *emphasis*
111 EMPHASIS_RE = r'(\*)([^\*]+)\2'
112
113 # **strong**
114 STRONG_RE = r'(\*{2}|_{2})(.+?)\2'
115
116 # ***strongem*** or ***em*strong**
117 EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'
118
119 # ***strong**em*
120 STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'
121
122 # _smart_emphasis_
123 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'
124
125 # _emphasis_
126 EMPHASIS_2_RE = r'(_)(.+?)\2'
127
128 # [text](url) or [text](<url>) or [text](url "title")
138 LINK_RE = NOIMG + BRK + \ 129 LINK_RE = NOIMG + BRK + \
139 r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)''' 130 r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
140 # [text](url) or [text](<url>) or [text](url "title")
141 131
142 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
143 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>) 132 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
144 REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]' # [Google][3] 133 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^")]+"[^"]*"|[^\)]*))\)'
145 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]' # [Google]
146 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]
147 NOT_STRONG_RE = r'((^| )(\*|_)( |$))' # stand-alone * or _
148 AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>
149 AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com>
150 134
151 HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...> 135 # [Google][3]
152 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &amp; 136 REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'
153 LINE_BREAK_RE = r'  \n' # two spaces at end of line 137
138 # [Google]
139 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'
140
141 # ![alt text][2]
142 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]'
143
144 # stand-alone * or _
145 NOT_STRONG_RE = r'((^| )(\*|_)( |$))'
146
147 # <http://www.123.com>
148 AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'
149
150 # <me@example.com>
151 AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'
152
153 # <...>
154 HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'
155
156 # &amp;
157 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'
158
159 # two spaces at end of line
160 LINE_BREAK_RE = r'  \n'
154 161
155 162
156 def dequote(string): 163 def dequote(string):
157 """Remove quotes from around a string.""" 164 """Remove quotes from around a string."""
158 if ( ( string.startswith('"') and string.endswith('"')) 165 if ((string.startswith('"') and string.endswith('"')) or
159 or (string.startswith("'") and string.endswith("'")) ): 166 (string.startswith("'") and string.endswith("'"))):
160 return string[1:-1] 167 return string[1:-1]
161 else: 168 else:
162 return string 169 return string
163 170
164 ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} 171
172 ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
173
165 174
166 def handleAttributes(text, parent): 175 def handleAttributes(text, parent):
167 """Set values of an element based on attribute definitions ({@id=123}).""" 176 """Set values of an element based on attribute definitions ({@id=123})."""
168 def attributeCallback(match): 177 def attributeCallback(match):
169 parent.set(match.group(1), match.group(2).replace('\n', ' ')) 178 parent.set(match.group(1), match.group(2).replace('\n', ' '))
170 return ATTR_RE.sub(attributeCallback, text) 179 return ATTR_RE.sub(attributeCallback, text)
171 180
172 181
173 """ 182 """
174 The pattern classes 183 The pattern classes
175 ----------------------------------------------------------------------------- 184 -----------------------------------------------------------------------------
176 """ 185 """
177 186
187
178 class Pattern(object): 188 class Pattern(object):
179 """Base class that inline patterns subclass. """ 189 """Base class that inline patterns subclass. """
180 190
181 def __init__(self, pattern, markdown_instance=None): 191 def __init__(self, pattern, markdown_instance=None):
182 """ 192 """
183 Create an instant of an inline pattern. 193 Create an instant of an inline pattern.
184 194
185 Keyword arguments: 195 Keyword arguments:
186 196
187 * pattern: A regular expression that matches a pattern 197 * pattern: A regular expression that matches a pattern
188 198
189 """ 199 """
190 self.pattern = pattern 200 self.pattern = pattern
191 self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, 201 self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
192 re.DOTALL | re.UNICODE) 202 re.DOTALL | re.UNICODE)
193 203
194 # Api for Markdown to pass safe_mode into instance 204 # Api for Markdown to pass safe_mode into instance
195 self.safe_mode = False 205 self.safe_mode = False
196 if markdown_instance: 206 if markdown_instance:
197 self.markdown = markdown_instance 207 self.markdown = markdown_instance
198 208
199 def getCompiledRegExp(self): 209 def getCompiledRegExp(self):
200 """ Return a compiled regular expression. """ 210 """ Return a compiled regular expression. """
201 return self.compiled_re 211 return self.compiled_re
202 212
203 def handleMatch(self, m): 213 def handleMatch(self, m):
204 """Return a ElementTree element from the given match. 214 """Return a ElementTree element from the given match.
205 215
206 Subclasses should override this method. 216 Subclasses should override this method.
207 217
208 Keyword arguments: 218 Keyword arguments:
209 219
210 * m: A re match object containing a match of the pattern. 220 * m: A re match object containing a match of the pattern.
211 221
212 """ 222 """
213 pass 223 pass # pragma: no cover
214 224
215 def type(self): 225 def type(self):
216 """ Return class name, to define pattern type """ 226 """ Return class name, to define pattern type """
217 return self.__class__.__name__ 227 return self.__class__.__name__
218 228
219 def unescape(self, text): 229 def unescape(self, text):
220 """ Return unescaped text given text with an inline placeholder. """ 230 """ Return unescaped text given text with an inline placeholder. """
221 try: 231 try:
222 stash = self.markdown.treeprocessors['inline'].stashed_nodes 232 stash = self.markdown.treeprocessors['inline'].stashed_nodes
223 except KeyError: 233 except KeyError: # pragma: no cover
224 return text 234 return text
225 def itertext(el): 235
236 def itertext(el): # pragma: no cover
226 ' Reimplement Element.itertext for older python versions ' 237 ' Reimplement Element.itertext for older python versions '
227 tag = el.tag 238 tag = el.tag
228 if not isinstance(tag, util.string_type) and tag is not None: 239 if not isinstance(tag, util.string_type) and tag is not None:
229 return 240 return
230 if el.text: 241 if el.text:
231 yield el.text 242 yield el.text
232 for e in el: 243 for e in el:
233 for s in itertext(e): 244 for s in itertext(e):
234 yield s 245 yield s
235 if e.tail: 246 if e.tail:
236 yield e.tail 247 yield e.tail
248
237 def get_stash(m): 249 def get_stash(m):
238 id = m.group(1) 250 id = m.group(1)
239 if id in stash: 251 if id in stash:
240 value = stash.get(id) 252 value = stash.get(id)
241 if isinstance(value, util.string_type): 253 if isinstance(value, util.string_type):
242 return value 254 return value
243 else: 255 else:
244 # An etree Element - return text content only 256 # An etree Element - return text content only
245 return ''.join(itertext(value)) 257 return ''.join(itertext(value))
246 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) 258 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
247 259
248 260
249 class SimpleTextPattern(Pattern): 261 class SimpleTextPattern(Pattern):
250 """ Return a simple text of group(2) of a Pattern. """ 262 """ Return a simple text of group(2) of a Pattern. """
251 def handleMatch(self, m): 263 def handleMatch(self, m):
252 text = m.group(2) 264 return m.group(2)
253 if text == util.INLINE_PLACEHOLDER_PREFIX:
254 return None
255 return text
256 265
257 266
258 class EscapePattern(Pattern): 267 class EscapePattern(Pattern):
259 """ Return an escaped character. """ 268 """ Return an escaped character. """
260 269
261 def handleMatch(self, m): 270 def handleMatch(self, m):
262 char = m.group(2) 271 char = m.group(2)
263 if char in self.markdown.ESCAPED_CHARS: 272 if char in self.markdown.ESCAPED_CHARS:
264 return '%s%s%s' % (util.STX, ord(char), util.ETX) 273 return '%s%s%s' % (util.STX, ord(char), util.ETX)
265 else: 274 else:
266 return '\\%s' % char 275 return None
267 276
268 277
269 class SimpleTagPattern(Pattern): 278 class SimpleTagPattern(Pattern):
270 """ 279 """
271 Return element of type `tag` with a text attribute of group(3) 280 Return element of type `tag` with a text attribute of group(3)
272 of a Pattern. 281 of a Pattern.
273 282
274 """ 283 """
275 def __init__ (self, pattern, tag): 284 def __init__(self, pattern, tag):
276 Pattern.__init__(self, pattern) 285 Pattern.__init__(self, pattern)
277 self.tag = tag 286 self.tag = tag
278 287
279 def handleMatch(self, m): 288 def handleMatch(self, m):
280 el = util.etree.Element(self.tag) 289 el = util.etree.Element(self.tag)
281 el.text = m.group(3) 290 el.text = m.group(3)
282 return el 291 return el
283 292
284 293
285 class SubstituteTagPattern(SimpleTagPattern): 294 class SubstituteTagPattern(SimpleTagPattern):
286 """ Return an element of type `tag` with no children. """ 295 """ Return an element of type `tag` with no children. """
287 def handleMatch (self, m): 296 def handleMatch(self, m):
288 return util.etree.Element(self.tag) 297 return util.etree.Element(self.tag)
289 298
290 299
291 class BacktickPattern(Pattern): 300 class BacktickPattern(Pattern):
292 """ Return a `<code>` element containing the matching text. """ 301 """ Return a `<code>` element containing the matching text. """
293 def __init__ (self, pattern): 302 def __init__(self, pattern):
294 Pattern.__init__(self, pattern) 303 Pattern.__init__(self, pattern)
295 self.tag = "code" 304 self.tag = "code"
296 305
297 def handleMatch(self, m): 306 def handleMatch(self, m):
298 el = util.etree.Element(self.tag) 307 el = util.etree.Element(self.tag)
299 el.text = util.AtomicString(m.group(3).strip()) 308 el.text = util.AtomicString(m.group(3).strip())
300 return el 309 return el
301 310
302 311
303 class DoubleTagPattern(SimpleTagPattern): 312 class DoubleTagPattern(SimpleTagPattern):
304 """Return a ElementTree element nested in tag2 nested in tag1. 313 """Return a ElementTree element nested in tag2 nested in tag1.
305 314
306 Useful for strong emphasis etc. 315 Useful for strong emphasis etc.
307 316
308 """ 317 """
309 def handleMatch(self, m): 318 def handleMatch(self, m):
310 tag1, tag2 = self.tag.split(",") 319 tag1, tag2 = self.tag.split(",")
311 el1 = util.etree.Element(tag1) 320 el1 = util.etree.Element(tag1)
312 el2 = util.etree.SubElement(el1, tag2) 321 el2 = util.etree.SubElement(el1, tag2)
313 el2.text = m.group(3) 322 el2.text = m.group(3)
323 if len(m.groups()) == 5:
324 el2.tail = m.group(4)
314 return el1 325 return el1
315 326
316 327
317 class HtmlPattern(Pattern): 328 class HtmlPattern(Pattern):
318 """ Store raw inline html and return a placeholder. """ 329 """ Store raw inline html and return a placeholder. """
319 def handleMatch (self, m): 330 def handleMatch(self, m):
320 rawhtml = self.unescape(m.group(2)) 331 rawhtml = self.unescape(m.group(2))
321 place_holder = self.markdown.htmlStash.store(rawhtml) 332 place_holder = self.markdown.htmlStash.store(rawhtml)
322 return place_holder 333 return place_holder
323 334
324 def unescape(self, text): 335 def unescape(self, text):
325 """ Return unescaped text given text with an inline placeholder. """ 336 """ Return unescaped text given text with an inline placeholder. """
326 try: 337 try:
327 stash = self.markdown.treeprocessors['inline'].stashed_nodes 338 stash = self.markdown.treeprocessors['inline'].stashed_nodes
328 except KeyError: 339 except KeyError: # pragma: no cover
329 return text 340 return text
341
330 def get_stash(m): 342 def get_stash(m):
331 id = m.group(1) 343 id = m.group(1)
332 value = stash.get(id) 344 value = stash.get(id)
333 if value is not None: 345 if value is not None:
334 try: 346 try:
335 return self.markdown.serializer(value) 347 return self.markdown.serializer(value)
336 except: 348 except:
337 return '\%s' % value 349 return '\%s' % value
338 350
339 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) 351 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
340 352
341 353
342 class LinkPattern(Pattern): 354 class LinkPattern(Pattern):
343 """ Return a link element from the given match. """ 355 """ Return a link element from the given match. """
344 def handleMatch(self, m): 356 def handleMatch(self, m):
345 el = util.etree.Element("a") 357 el = util.etree.Element("a")
346 el.text = m.group(2) 358 el.text = m.group(2)
347 title = m.group(13) 359 title = m.group(13)
348 href = m.group(9) 360 href = m.group(9)
349 361
350 if href: 362 if href:
351 if href[0] == "<": 363 if href[0] == "<":
352 href = href[1:-1] 364 href = href[1:-1]
353 el.set("href", self.sanitize_url(self.unescape(href.strip()))) 365 el.set("href", self.sanitize_url(self.unescape(href.strip())))
354 else: 366 else:
355 el.set("href", "") 367 el.set("href", "")
356 368
357 if title: 369 if title:
358 title = dequote(self.unescape(title)) 370 title = dequote(self.unescape(title))
359 el.set("title", title) 371 el.set("title", title)
360 return el 372 return el
361 373
362 def sanitize_url(self, url): 374 def sanitize_url(self, url):
363 """ 375 """
364 Sanitize a url against xss attacks in "safe_mode". 376 Sanitize a url against xss attacks in "safe_mode".
365 377
366 Rather than specifically blacklisting `javascript:alert("XSS")` and all 378 Rather than specifically blacklisting `javascript:alert("XSS")` and all
367 its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known 379 its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
368 safe url formats. Most urls contain a network location, however some 380 safe url formats. Most urls contain a network location, however some
369 are known not to (i.e.: mailto links). Script urls do not contain a 381 are known not to (i.e.: mailto links). Script urls do not contain a
370 location. Additionally, for `javascript:...`, the scheme would be 382 location. Additionally, for `javascript:...`, the scheme would be
371 "javascript" but some aliases will appear to `urlparse()` to have no 383 "javascript" but some aliases will appear to `urlparse()` to have no
372 scheme. On top of that relative links (i.e.: "foo/bar.html") have no 384 scheme. On top of that relative links (i.e.: "foo/bar.html") have no
373 scheme. Therefore we must check "path", "parameters", "query" and 385 scheme. Therefore we must check "path", "parameters", "query" and
374 "fragment" for any literal colons. We don't check "scheme" for colons 386 "fragment" for any literal colons. We don't check "scheme" for colons
375 because it *should* never have any and "netloc" must allow the form: 387 because it *should* never have any and "netloc" must allow the form:
376 `username:password@host:port`. 388 `username:password@host:port`.
377 389
378 """ 390 """
379 url = url.replace(' ', '%20')
380 if not self.markdown.safeMode: 391 if not self.markdown.safeMode:
381 # Return immediately bipassing parsing. 392 # Return immediately bipassing parsing.
382 return url 393 return url
383 394
384 try: 395 try:
385 scheme, netloc, path, params, query, fragment = url = urlparse(url) 396 scheme, netloc, path, params, query, fragment = url = urlparse(url)
386 except ValueError: 397 except ValueError: # pragma: no cover
387 # Bad url - so bad it couldn't be parsed. 398 # Bad url - so bad it couldn't be parsed.
388 return '' 399 return ''
389 400
390 locless_schemes = ['', 'mailto', 'news'] 401 locless_schemes = ['', 'mailto', 'news']
391 allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps'] 402 allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
392 if scheme not in allowed_schemes: 403 if scheme not in allowed_schemes:
393 # Not a known (allowed) scheme. Not safe. 404 # Not a known (allowed) scheme. Not safe.
394 return '' 405 return ''
395 406
396 if netloc == '' and scheme not in locless_schemes: 407 if netloc == '' and scheme not in locless_schemes: # pragma: no cover
397 # This should not happen. Treat as suspect. 408 # This should not happen. Treat as suspect.
398 return '' 409 return ''
399 410
400 for part in url[2:]: 411 for part in url[2:]:
401 if ":" in part: 412 if ":" in part:
401 # A colon in "path", "parameters", "query" or "fragment" is suspect. 413 # A colon in "path", "parameters", "query"
414 # or "fragment" is suspect.
403 return '' 415 return ''
404 416
405 # Url passes all tests. Return url as-is. 417 # Url passes all tests. Return url as-is.
406 return urlunparse(url) 418 return urlunparse(url)
407 419
420
408 class ImagePattern(LinkPattern): 421 class ImagePattern(LinkPattern):
409 """ Return a img element from the given match. """ 422 """ Return a img element from the given match. """
410 def handleMatch(self, m): 423 def handleMatch(self, m):
411 el = util.etree.Element("img") 424 el = util.etree.Element("img")
412 src_parts = m.group(9).split() 425 src_parts = m.group(9).split()
413 if src_parts: 426 if src_parts:
414 src = src_parts[0] 427 src = src_parts[0]
415 if src[0] == "<" and src[-1] == ">": 428 if src[0] == "<" and src[-1] == ">":
416 src = src[1:-1] 429 src = src[1:-1]
417 el.set('src', self.sanitize_url(self.unescape(src))) 430 el.set('src', self.sanitize_url(self.unescape(src)))
418 else: 431 else:
419 el.set('src', "") 432 el.set('src', "")
420 if len(src_parts) > 1: 433 if len(src_parts) > 1:
421 el.set('title', dequote(self.unescape(" ".join(src_parts[1:])))) 434 el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))
422 435
423 if self.markdown.enable_attributes: 436 if self.markdown.enable_attributes:
424 truealt = handleAttributes(m.group(2), el) 437 truealt = handleAttributes(m.group(2), el)
425 else: 438 else:
426 truealt = m.group(2) 439 truealt = m.group(2)
427 440
428 el.set('alt', self.unescape(truealt)) 441 el.set('alt', self.unescape(truealt))
429 return el 442 return el
430 443
444
431 class ReferencePattern(LinkPattern): 445 class ReferencePattern(LinkPattern):
432 """ Match to a stored reference and return link element. """ 446 """ Match to a stored reference and return link element. """
433 447
434 NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE) 448 NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)
435 449
436 def handleMatch(self, m): 450 def handleMatch(self, m):
437 try: 451 try:
438 id = m.group(9).lower() 452 id = m.group(9).lower()
439 except IndexError: 453 except IndexError:
440 id = None 454 id = None
441 if not id: 455 if not id:
442 # if we got something like "[Google][]" or "[Goggle]" 456 # if we got something like "[Google][]" or "[Goggle]"
443 # we'll use "google" as the id 457 # we'll use "google" as the id
444 id = m.group(2).lower() 458 id = m.group(2).lower()
445 459
446 # Clean up linebreaks in id 460 # Clean up linebreaks in id
447 id = self.NEWLINE_CLEANUP_RE.sub(' ', id) 461 id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
448 if not id in self.markdown.references: # ignore undefined refs 462 if id not in self.markdown.references: # ignore undefined refs
449 return None 463 return None
450 href, title = self.markdown.references[id] 464 href, title = self.markdown.references[id]
451 465
452 text = m.group(2) 466 text = m.group(2)
453 return self.makeTag(href, title, text) 467 return self.makeTag(href, title, text)
454 468
455 def makeTag(self, href, title, text): 469 def makeTag(self, href, title, text):
456 el = util.etree.Element('a') 470 el = util.etree.Element('a')
457 471
458 el.set('href', self.sanitize_url(href)) 472 el.set('href', self.sanitize_url(href))
(...skipping 20 matching lines...) Expand all
479 493
480 494
481 class AutolinkPattern(Pattern): 495 class AutolinkPattern(Pattern):
482 """ Return a link Element given an autolink (`<http://example/com>`). """ 496 """ Return a link Element given an autolink (`<http://example/com>`). """
483 def handleMatch(self, m): 497 def handleMatch(self, m):
484 el = util.etree.Element("a") 498 el = util.etree.Element("a")
485 el.set('href', self.unescape(m.group(2))) 499 el.set('href', self.unescape(m.group(2)))
486 el.text = util.AtomicString(m.group(2)) 500 el.text = util.AtomicString(m.group(2))
487 return el 501 return el
488 502
503
489 class AutomailPattern(Pattern): 504 class AutomailPattern(Pattern):
490 """ 505 """
491 Return a mailto link Element given an automail link (`<foo@example.com>`). 506 Return a mailto link Element given an automail link (`<foo@example.com>`).
492 """ 507 """
493 def handleMatch(self, m): 508 def handleMatch(self, m):
494 el = util.etree.Element('a') 509 el = util.etree.Element('a')
495 email = self.unescape(m.group(2)) 510 email = self.unescape(m.group(2))
496 if email.startswith("mailto:"): 511 if email.startswith("mailto:"):
497 email = email[len("mailto:"):] 512 email = email[len("mailto:"):]
498 513
499 def codepoint2name(code): 514 def codepoint2name(code):
500 """Return entity definition by code, or the code if not defined.""" 515 """Return entity definition by code, or the code if not defined."""
501 entity = entities.codepoint2name.get(code) 516 entity = entities.codepoint2name.get(code)
502 if entity: 517 if entity:
503 return "%s%s;" % (util.AMP_SUBSTITUTE, entity) 518 return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
504 else: 519 else:
505 return "%s#%d;" % (util.AMP_SUBSTITUTE, code) 520 return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
506 521
507 letters = [codepoint2name(ord(letter)) for letter in email] 522 letters = [codepoint2name(ord(letter)) for letter in email]
508 el.text = util.AtomicString(''.join(letters)) 523 el.text = util.AtomicString(''.join(letters))
509 524
510 mailto = "mailto:" + email 525 mailto = "mailto:" + email
511 mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' % 526 mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
512 ord(letter) for letter in mailto]) 527 ord(letter) for letter in mailto])
513 el.set('href', mailto) 528 el.set('href', mailto)
514 return el 529 return el
515
OLDNEW
« no previous file with comments | « third_party/Python-Markdown/markdown/extensions/wikilinks.py ('k') | third_party/Python-Markdown/markdown/odict.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698