third_party/Python-Markdown/markdown/inlinepatterns.py - Issue 1356203004: Check in a simple pure-python based Markdown previewer.

Side by Side Diff: third_party/Python-Markdown/markdown/inlinepatterns.py

Issue 1356203004: Check in a simple pure-python based Markdown previewer. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@add

Patch Set: patch for review Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # markdown is released under the BSD license

2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)

3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)

4 # Copyright 2004 Manfred Stienstra (the original version)

5 #

6 # All rights reserved.

7 #

8 # Redistribution and use in source and binary forms, with or without

9 # modification, are permitted provided that the following conditions are met:

10 #

11 # * Redistributions of source code must retain the above copyright

12 # notice, this list of conditions and the following disclaimer.

13 # * Redistributions in binary form must reproduce the above copyright

14 # notice, this list of conditions and the following disclaimer in the

15 # documentation and/or other materials provided with the distribution.

16 # * Neither the name of the <organization> nor the

17 # names of its contributors may be used to endorse or promote products

18 # derived from this software without specific prior written permission.

19 #

20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY

21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT

24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

30 # POSSIBILITY OF SUCH DAMAGE.

31

32

33 """	1 """

34 INLINE PATTERNS	2 INLINE PATTERNS

35 =============================================================================	3 =============================================================================

36	4

37 Inline patterns such as emphasis are handled by means of auxiliary	5 Inline patterns such as emphasis are handled by means of auxiliary

38 objects, one per pattern. Pattern objects must be instances of classes	6 objects, one per pattern. Pattern objects must be instances of classes

39 that extend markdown.Pattern. Each pattern object uses a single regular	7 that extend markdown.Pattern. Each pattern object uses a single regular

40 expression and needs support the following methods:	8 expression and needs support the following methods:

41	9

42 pattern.getCompiledRegExp() # returns a regular expression	10 pattern.getCompiledRegExp() # returns a regular expression

(...skipping 28 matching lines...) Expand all Loading...
71 * then bracketed links, first regular then reference-style	39 * then bracketed links, first regular then reference-style

72	40

73 * finally we apply strong and emphasis	41 * finally we apply strong and emphasis

74 """	42 """

75	43

76 from __future__ import absolute_import	44 from __future__ import absolute_import

77 from __future__ import unicode_literals	45 from __future__ import unicode_literals

78 from . import util	46 from . import util

79 from . import odict	47 from . import odict

80 import re	48 import re

81 try:	49 try: # pragma: no cover

82 from urllib.parse import urlparse, urlunparse	50 from urllib.parse import urlparse, urlunparse

83 except ImportError:	51 except ImportError: # pragma: no cover

84 from urlparse import urlparse, urlunparse	52 from urlparse import urlparse, urlunparse

85 try:	53 try: # pragma: no cover

86 from html import entities	54 from html import entities

87 except ImportError:	55 except ImportError: # pragma: no cover

88 import htmlentitydefs as entities	56 import htmlentitydefs as entities

89	57

90	58

91 def build_inlinepatterns(md_instance, **kwargs):	59 def build_inlinepatterns(md_instance, **kwargs):

92 """ Build the default set of inline patterns for Markdown. """	60 """ Build the default set of inline patterns for Markdown. """

93 inlinePatterns = odict.OrderedDict()	61 inlinePatterns = odict.OrderedDict()

94 inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)	62 inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)

95 inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)	63 inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)

96 inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)	64 inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)

97 inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)	65 inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)

98 inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)	66 inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)

99 inlinePatterns["image_reference"] = \	67 inlinePatterns["image_reference"] = ImageReferencePattern(

100 ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)	68 IMAGE_REFERENCE_RE, md_instance

101 inlinePatterns["short_reference"] = \	69 )

102 ReferencePattern(SHORT_REF_RE, md_instance)	70 inlinePatterns["short_reference"] = ReferencePattern(

	71 SHORT_REF_RE, md_instance

	72 )

103 inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)	73 inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)

104 inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)	74 inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)

105 inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')	75 inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')

106 if md_instance.safeMode != 'escape':	76 if md_instance.safeMode != 'escape':

107 inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)	77 inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)

108 inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)	78 inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)

109 inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)	79 inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)

110 inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em')	80 inlinePatterns["em_strong"] = DoubleTagPattern(EM_STRONG_RE, 'strong,em')

	81 inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'em,strong')

111 inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')	82 inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')

112 inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')	83 inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')

113 if md_instance.smart_emphasis:	84 if md_instance.smart_emphasis:

114 inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')	85 inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')

115 else:	86 else:

116 inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')	87 inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')

117 return inlinePatterns	88 return inlinePatterns

118	89

119 """	90 """

120 The actual regular expressions for patterns	91 The actual regular expressions for patterns

121 -----------------------------------------------------------------------------	92 -----------------------------------------------------------------------------

122 """	93 """

123	94

124 NOBRACKET = r'[^\]\[]*'	95 NOBRACKET = r'[^\]\[]*'

125 BRK = ( r'\[('	96 BRK = (

126 + (NOBRACKET + r'(\[')*6	97 r'\[(' +

127 + (NOBRACKET+ r'\])*')*6	98 (NOBRACKET + r'(\[')*6 +

128 + NOBRACKET + r')\]' )	99 (NOBRACKET + r'\])*')*6 +

	100 NOBRACKET + r')\]'

	101 )

129 NOIMG = r'(?<!\!)'	102 NOIMG = r'(?<!\!)'

130	103

131 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``	104 # `e=f()` or ``e=f("`")``

132 ESCAPE_RE = r'\\(.)' # \<	105 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'

133 EMPHASIS_RE = r'(\*)([^\*]+)\2' # *emphasis*	106

134 STRONG_RE = r'(\*{2}|_{2})(.+?)\2' # **strong**	107 # \<

135 STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2' # ***strong***	108 ESCAPE_RE = r'\\(.)'

136 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)' # _smart_emphasis_	109

137 EMPHASIS_2_RE = r'(_)(.+?)\2' # _emphasis_	110 # *emphasis*

	111 EMPHASIS_RE = r'(\*)([^\*]+)\2'

	112

	113 # **strong**

	114 STRONG_RE = r'(\*{2}|_{2})(.+?)\2'

	115

	116 # ***strongem*** or ***em*strong**

	117 EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'

	118

	119 # ***strong**em*

	120 STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'

	121

	122 # _smart_emphasis_

	123 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'

	124

	125 # _emphasis_

	126 EMPHASIS_2_RE = r'(_)(.+?)\2'

	127

	128 # [text](url) or [text](<url>) or [text](url "title")

138 LINK_RE = NOIMG + BRK + \	129 LINK_RE = NOIMG + BRK + \

139 r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''	130 r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''

140 # [text](url) or [text](<url>) or [text](url "title")

141	131

142 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'

143 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)	132 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)

144 REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]' # [Google][3]	133 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^")]+"[^"]*"|[^\)]*))\)'

145 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]' # [Google]

146 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]

147 NOT_STRONG_RE = r'((^| )(\*|_)( |$))' # stand-alone * or _

148 AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>

149 AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com>

150	134

151 HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...>	135 # [Google][3]

152 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &	136 REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'

153 LINE_BREAK_RE = r'  \n' # two spaces at end of line	137

	138 # [Google]

	139 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'

	140

	141 # ![alt text][2]

	142 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]'

	143

	144 # stand-alone * or _

	145 NOT_STRONG_RE = r'((^| )(\*|_)( |$))'

	146

	147 # <http://www.123.com>

	148 AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'

	149

	150 # <me@example.com>

	151 AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'

	152

	153 # <...>

	154 HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'

	155

	156 # &

	157 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'

	158

	159 # two spaces at end of line

	160 LINE_BREAK_RE = r'  \n'

154	161

155	162

156 def dequote(string):	163 def dequote(string):

157 """Remove quotes from around a string."""	164 """Remove quotes from around a string."""

158 if ( ( string.startswith('"') and string.endswith('"'))	165 if ((string.startswith('"') and string.endswith('"')) or

159 or (string.startswith("'") and string.endswith("'")) ):	166 (string.startswith("'") and string.endswith("'"))):

160 return string[1:-1]	167 return string[1:-1]

161 else:	168 else:

162 return string	169 return string

163	170

164 ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}	171

	172 ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}

	173

165	174

166 def handleAttributes(text, parent):	175 def handleAttributes(text, parent):

167 """Set values of an element based on attribute definitions ({@id=123})."""	176 """Set values of an element based on attribute definitions ({@id=123})."""

168 def attributeCallback(match):	177 def attributeCallback(match):

169 parent.set(match.group(1), match.group(2).replace('\n', ' '))	178 parent.set(match.group(1), match.group(2).replace('\n', ' '))

170 return ATTR_RE.sub(attributeCallback, text)	179 return ATTR_RE.sub(attributeCallback, text)

171	180

172	181

173 """	182 """

174 The pattern classes	183 The pattern classes

175 -----------------------------------------------------------------------------	184 -----------------------------------------------------------------------------

176 """	185 """

177	186

	187

178 class Pattern(object):	188 class Pattern(object):

179 """Base class that inline patterns subclass. """	189 """Base class that inline patterns subclass. """

180	190

181 def __init__(self, pattern, markdown_instance=None):	191 def __init__(self, pattern, markdown_instance=None):

182 """	192 """

183 Create an instant of an inline pattern.	193 Create an instant of an inline pattern.

184	194

185 Keyword arguments:	195 Keyword arguments:

186	196

187 * pattern: A regular expression that matches a pattern	197 * pattern: A regular expression that matches a pattern

188	198

189 """	199 """

190 self.pattern = pattern	200 self.pattern = pattern

191 self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,	201 self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,

192 re.DOTALL | re.UNICODE)	202 re.DOTALL | re.UNICODE)

193	203

194 # Api for Markdown to pass safe_mode into instance	204 # Api for Markdown to pass safe_mode into instance

195 self.safe_mode = False	205 self.safe_mode = False

196 if markdown_instance:	206 if markdown_instance:

197 self.markdown = markdown_instance	207 self.markdown = markdown_instance

198	208

199 def getCompiledRegExp(self):	209 def getCompiledRegExp(self):

200 """ Return a compiled regular expression. """	210 """ Return a compiled regular expression. """

201 return self.compiled_re	211 return self.compiled_re

202	212

203 def handleMatch(self, m):	213 def handleMatch(self, m):

204 """Return a ElementTree element from the given match.	214 """Return a ElementTree element from the given match.

205	215

206 Subclasses should override this method.	216 Subclasses should override this method.

207	217

208 Keyword arguments:	218 Keyword arguments:

209	219

210 * m: A re match object containing a match of the pattern.	220 * m: A re match object containing a match of the pattern.

211	221

212 """	222 """

213 pass	223 pass # pragma: no cover

214	224

215 def type(self):	225 def type(self):

216 """ Return class name, to define pattern type """	226 """ Return class name, to define pattern type """

217 return self.__class__.__name__	227 return self.__class__.__name__

218	228

219 def unescape(self, text):	229 def unescape(self, text):

220 """ Return unescaped text given text with an inline placeholder. """	230 """ Return unescaped text given text with an inline placeholder. """

221 try:	231 try:

222 stash = self.markdown.treeprocessors['inline'].stashed_nodes	232 stash = self.markdown.treeprocessors['inline'].stashed_nodes

223 except KeyError:	233 except KeyError: # pragma: no cover

224 return text	234 return text

225 def itertext(el):	235

	236 def itertext(el): # pragma: no cover

226 ' Reimplement Element.itertext for older python versions '	237 ' Reimplement Element.itertext for older python versions '

227 tag = el.tag	238 tag = el.tag

228 if not isinstance(tag, util.string_type) and tag is not None:	239 if not isinstance(tag, util.string_type) and tag is not None:

229 return	240 return

230 if el.text:	241 if el.text:

231 yield el.text	242 yield el.text

232 for e in el:	243 for e in el:

233 for s in itertext(e):	244 for s in itertext(e):

234 yield s	245 yield s

235 if e.tail:	246 if e.tail:

236 yield e.tail	247 yield e.tail

	248

237 def get_stash(m):	249 def get_stash(m):

238 id = m.group(1)	250 id = m.group(1)

239 if id in stash:	251 if id in stash:

240 value = stash.get(id)	252 value = stash.get(id)

241 if isinstance(value, util.string_type):	253 if isinstance(value, util.string_type):

242 return value	254 return value

243 else:	255 else:

244 # An etree Element - return text content only	256 # An etree Element - return text content only

245 return ''.join(itertext(value))	257 return ''.join(itertext(value))

246 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)	258 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

247	259

248	260

249 class SimpleTextPattern(Pattern):	261 class SimpleTextPattern(Pattern):

250 """ Return a simple text of group(2) of a Pattern. """	262 """ Return a simple text of group(2) of a Pattern. """

251 def handleMatch(self, m):	263 def handleMatch(self, m):

252 text = m.group(2)	264 return m.group(2)

253 if text == util.INLINE_PLACEHOLDER_PREFIX:

254 return None

255 return text

256	265

257	266

258 class EscapePattern(Pattern):	267 class EscapePattern(Pattern):

259 """ Return an escaped character. """	268 """ Return an escaped character. """

260	269

261 def handleMatch(self, m):	270 def handleMatch(self, m):

262 char = m.group(2)	271 char = m.group(2)

263 if char in self.markdown.ESCAPED_CHARS:	272 if char in self.markdown.ESCAPED_CHARS:

264 return '%s%s%s' % (util.STX, ord(char), util.ETX)	273 return '%s%s%s' % (util.STX, ord(char), util.ETX)

265 else:	274 else:

266 return '\\%s' % char	275 return None

267	276

268	277

269 class SimpleTagPattern(Pattern):	278 class SimpleTagPattern(Pattern):

270 """	279 """

271 Return element of type `tag` with a text attribute of group(3)	280 Return element of type `tag` with a text attribute of group(3)

272 of a Pattern.	281 of a Pattern.

273	282

274 """	283 """

275 def __init__ (self, pattern, tag):	284 def __init__(self, pattern, tag):

276 Pattern.__init__(self, pattern)	285 Pattern.__init__(self, pattern)

277 self.tag = tag	286 self.tag = tag

278	287

279 def handleMatch(self, m):	288 def handleMatch(self, m):

280 el = util.etree.Element(self.tag)	289 el = util.etree.Element(self.tag)

281 el.text = m.group(3)	290 el.text = m.group(3)

282 return el	291 return el

283	292

284	293

285 class SubstituteTagPattern(SimpleTagPattern):	294 class SubstituteTagPattern(SimpleTagPattern):

286 """ Return an element of type `tag` with no children. """	295 """ Return an element of type `tag` with no children. """

287 def handleMatch (self, m):	296 def handleMatch(self, m):

288 return util.etree.Element(self.tag)	297 return util.etree.Element(self.tag)

289	298

290	299

291 class BacktickPattern(Pattern):	300 class BacktickPattern(Pattern):

292 """ Return a `<code>` element containing the matching text. """	301 """ Return a `<code>` element containing the matching text. """

293 def __init__ (self, pattern):	302 def __init__(self, pattern):

294 Pattern.__init__(self, pattern)	303 Pattern.__init__(self, pattern)

295 self.tag = "code"	304 self.tag = "code"

296	305

297 def handleMatch(self, m):	306 def handleMatch(self, m):

298 el = util.etree.Element(self.tag)	307 el = util.etree.Element(self.tag)

299 el.text = util.AtomicString(m.group(3).strip())	308 el.text = util.AtomicString(m.group(3).strip())

300 return el	309 return el

301	310

302	311

303 class DoubleTagPattern(SimpleTagPattern):	312 class DoubleTagPattern(SimpleTagPattern):

304 """Return a ElementTree element nested in tag2 nested in tag1.	313 """Return a ElementTree element nested in tag2 nested in tag1.

305	314

306 Useful for strong emphasis etc.	315 Useful for strong emphasis etc.

307	316

308 """	317 """

309 def handleMatch(self, m):	318 def handleMatch(self, m):

310 tag1, tag2 = self.tag.split(",")	319 tag1, tag2 = self.tag.split(",")

311 el1 = util.etree.Element(tag1)	320 el1 = util.etree.Element(tag1)

312 el2 = util.etree.SubElement(el1, tag2)	321 el2 = util.etree.SubElement(el1, tag2)

313 el2.text = m.group(3)	322 el2.text = m.group(3)

	323 if len(m.groups()) == 5:

	324 el2.tail = m.group(4)

314 return el1	325 return el1

315	326

316	327

317 class HtmlPattern(Pattern):	328 class HtmlPattern(Pattern):

318 """ Store raw inline html and return a placeholder. """	329 """ Store raw inline html and return a placeholder. """

319 def handleMatch (self, m):	330 def handleMatch(self, m):

320 rawhtml = self.unescape(m.group(2))	331 rawhtml = self.unescape(m.group(2))

321 place_holder = self.markdown.htmlStash.store(rawhtml)	332 place_holder = self.markdown.htmlStash.store(rawhtml)

322 return place_holder	333 return place_holder

323	334

324 def unescape(self, text):	335 def unescape(self, text):

325 """ Return unescaped text given text with an inline placeholder. """	336 """ Return unescaped text given text with an inline placeholder. """

326 try:	337 try:

327 stash = self.markdown.treeprocessors['inline'].stashed_nodes	338 stash = self.markdown.treeprocessors['inline'].stashed_nodes

328 except KeyError:	339 except KeyError: # pragma: no cover

329 return text	340 return text

	341

330 def get_stash(m):	342 def get_stash(m):

331 id = m.group(1)	343 id = m.group(1)

332 value = stash.get(id)	344 value = stash.get(id)

333 if value is not None:	345 if value is not None:

334 try:	346 try:

335 return self.markdown.serializer(value)	347 return self.markdown.serializer(value)

336 except:	348 except:

337 return '\%s' % value	349 return '\%s' % value

338	350

339 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)	351 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

340	352

341	353

342 class LinkPattern(Pattern):	354 class LinkPattern(Pattern):

343 """ Return a link element from the given match. """	355 """ Return a link element from the given match. """

344 def handleMatch(self, m):	356 def handleMatch(self, m):

345 el = util.etree.Element("a")	357 el = util.etree.Element("a")

346 el.text = m.group(2)	358 el.text = m.group(2)

347 title = m.group(13)	359 title = m.group(13)

348 href = m.group(9)	360 href = m.group(9)

349	361

350 if href:	362 if href:

351 if href[0] == "<":	363 if href[0] == "<":

352 href = href[1:-1]	364 href = href[1:-1]

353 el.set("href", self.sanitize_url(self.unescape(href.strip())))	365 el.set("href", self.sanitize_url(self.unescape(href.strip())))

354 else:	366 else:

355 el.set("href", "")	367 el.set("href", "")

356	368

357 if title:	369 if title:

358 title = dequote(self.unescape(title))	370 title = dequote(self.unescape(title))

359 el.set("title", title)	371 el.set("title", title)

360 return el	372 return el

361	373

362 def sanitize_url(self, url):	374 def sanitize_url(self, url):

363 """	375 """

364 Sanitize a url against xss attacks in "safe_mode".	376 Sanitize a url against xss attacks in "safe_mode".

365	377

366 Rather than specifically blacklisting `javascript:alert("XSS")` and all	378 Rather than specifically blacklisting `javascript:alert("XSS")` and all

367 its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known	379 its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known

368 safe url formats. Most urls contain a network location, however some	380 safe url formats. Most urls contain a network location, however some

369 are known not to (i.e.: mailto links). Script urls do not contain a	381 are known not to (i.e.: mailto links). Script urls do not contain a

370 location. Additionally, for `javascript:...`, the scheme would be	382 location. Additionally, for `javascript:...`, the scheme would be

371 "javascript" but some aliases will appear to `urlparse()` to have no	383 "javascript" but some aliases will appear to `urlparse()` to have no

372 scheme. On top of that relative links (i.e.: "foo/bar.html") have no	384 scheme. On top of that relative links (i.e.: "foo/bar.html") have no

373 scheme. Therefore we must check "path", "parameters", "query" and	385 scheme. Therefore we must check "path", "parameters", "query" and

374 "fragment" for any literal colons. We don't check "scheme" for colons	386 "fragment" for any literal colons. We don't check "scheme" for colons

375 because it should never have any and "netloc" must allow the form:	387 because it should never have any and "netloc" must allow the form:

376 `username:password@host:port`.	388 `username:password@host:port`.

377	389

378 """	390 """

379 url = url.replace(' ', '%20')

380 if not self.markdown.safeMode:	391 if not self.markdown.safeMode:

381 # Return immediately bipassing parsing.	392 # Return immediately bipassing parsing.

382 return url	393 return url

383	394

384 try:	395 try:

385 scheme, netloc, path, params, query, fragment = url = urlparse(url)	396 scheme, netloc, path, params, query, fragment = url = urlparse(url)

386 except ValueError:	397 except ValueError: # pragma: no cover

387 # Bad url - so bad it couldn't be parsed.	398 # Bad url - so bad it couldn't be parsed.

388 return ''	399 return ''

389	400

390 locless_schemes = ['', 'mailto', 'news']	401 locless_schemes = ['', 'mailto', 'news']

391 allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']	402 allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']

392 if scheme not in allowed_schemes:	403 if scheme not in allowed_schemes:

393 # Not a known (allowed) scheme. Not safe.	404 # Not a known (allowed) scheme. Not safe.

394 return ''	405 return ''

395	406

396 if netloc == '' and scheme not in locless_schemes:	407 if netloc == '' and scheme not in locless_schemes: # pragma: no cover

397 # This should not happen. Treat as suspect.	408 # This should not happen. Treat as suspect.

398 return ''	409 return ''

399	410

400 for part in url[2:]:	411 for part in url[2:]:

401 if ":" in part:	412 if ":" in part:

402 # A colon in "path", "parameters", "query" or "fragment" is suspect.	413 # A colon in "path", "parameters", "query"

	414 # or "fragment" is suspect.

403 return ''	415 return ''

404	416

405 # Url passes all tests. Return url as-is.	417 # Url passes all tests. Return url as-is.

406 return urlunparse(url)	418 return urlunparse(url)

407	419

	420

408 class ImagePattern(LinkPattern):	421 class ImagePattern(LinkPattern):

409 """ Return a img element from the given match. """	422 """ Return a img element from the given match. """

410 def handleMatch(self, m):	423 def handleMatch(self, m):

411 el = util.etree.Element("img")	424 el = util.etree.Element("img")

412 src_parts = m.group(9).split()	425 src_parts = m.group(9).split()

413 if src_parts:	426 if src_parts:

414 src = src_parts[0]	427 src = src_parts[0]

415 if src[0] == "<" and src[-1] == ">":	428 if src[0] == "<" and src[-1] == ">":

416 src = src[1:-1]	429 src = src[1:-1]

417 el.set('src', self.sanitize_url(self.unescape(src)))	430 el.set('src', self.sanitize_url(self.unescape(src)))

418 else:	431 else:

419 el.set('src', "")	432 el.set('src', "")

420 if len(src_parts) > 1:	433 if len(src_parts) > 1:

421 el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))	434 el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))

422	435

423 if self.markdown.enable_attributes:	436 if self.markdown.enable_attributes:

424 truealt = handleAttributes(m.group(2), el)	437 truealt = handleAttributes(m.group(2), el)

425 else:	438 else:

426 truealt = m.group(2)	439 truealt = m.group(2)

427	440

428 el.set('alt', self.unescape(truealt))	441 el.set('alt', self.unescape(truealt))

429 return el	442 return el

430	443

	444

431 class ReferencePattern(LinkPattern):	445 class ReferencePattern(LinkPattern):

432 """ Match to a stored reference and return link element. """	446 """ Match to a stored reference and return link element. """

433	447

434 NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)	448 NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

435	449

436 def handleMatch(self, m):	450 def handleMatch(self, m):

437 try:	451 try:

438 id = m.group(9).lower()	452 id = m.group(9).lower()

439 except IndexError:	453 except IndexError:

440 id = None	454 id = None

441 if not id:	455 if not id:

442 # if we got something like "[Google][]" or "[Goggle]"	456 # if we got something like "[Google][]" or "[Goggle]"

443 # we'll use "google" as the id	457 # we'll use "google" as the id

444 id = m.group(2).lower()	458 id = m.group(2).lower()

445	459

446 # Clean up linebreaks in id	460 # Clean up linebreaks in id

447 id = self.NEWLINE_CLEANUP_RE.sub(' ', id)	461 id = self.NEWLINE_CLEANUP_RE.sub(' ', id)

448 if not id in self.markdown.references: # ignore undefined refs	462 if id not in self.markdown.references: # ignore undefined refs

449 return None	463 return None

450 href, title = self.markdown.references[id]	464 href, title = self.markdown.references[id]

451	465

452 text = m.group(2)	466 text = m.group(2)

453 return self.makeTag(href, title, text)	467 return self.makeTag(href, title, text)

454	468

455 def makeTag(self, href, title, text):	469 def makeTag(self, href, title, text):

456 el = util.etree.Element('a')	470 el = util.etree.Element('a')

457	471

458 el.set('href', self.sanitize_url(href))	472 el.set('href', self.sanitize_url(href))

(...skipping 20 matching lines...) Expand all Loading...
479	493

480	494

481 class AutolinkPattern(Pattern):	495 class AutolinkPattern(Pattern):

482 """ Return a link Element given an autolink (`<http://example/com>`). """	496 """ Return a link Element given an autolink (`<http://example/com>`). """

483 def handleMatch(self, m):	497 def handleMatch(self, m):

484 el = util.etree.Element("a")	498 el = util.etree.Element("a")

485 el.set('href', self.unescape(m.group(2)))	499 el.set('href', self.unescape(m.group(2)))

486 el.text = util.AtomicString(m.group(2))	500 el.text = util.AtomicString(m.group(2))

487 return el	501 return el

488	502

	503

489 class AutomailPattern(Pattern):	504 class AutomailPattern(Pattern):

490 """	505 """

491 Return a mailto link Element given an automail link (`<foo@example.com>`).	506 Return a mailto link Element given an automail link (`<foo@example.com>`).

492 """	507 """

493 def handleMatch(self, m):	508 def handleMatch(self, m):

494 el = util.etree.Element('a')	509 el = util.etree.Element('a')

495 email = self.unescape(m.group(2))	510 email = self.unescape(m.group(2))

496 if email.startswith("mailto:"):	511 if email.startswith("mailto:"):

497 email = email[len("mailto:"):]	512 email = email[len("mailto:"):]

498	513

499 def codepoint2name(code):	514 def codepoint2name(code):

500 """Return entity definition by code, or the code if not defined."""	515 """Return entity definition by code, or the code if not defined."""

501 entity = entities.codepoint2name.get(code)	516 entity = entities.codepoint2name.get(code)

502 if entity:	517 if entity:

503 return "%s%s;" % (util.AMP_SUBSTITUTE, entity)	518 return "%s%s;" % (util.AMP_SUBSTITUTE, entity)

504 else:	519 else:

505 return "%s#%d;" % (util.AMP_SUBSTITUTE, code)	520 return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

506	521

507 letters = [codepoint2name(ord(letter)) for letter in email]	522 letters = [codepoint2name(ord(letter)) for letter in email]

508 el.text = util.AtomicString(''.join(letters))	523 el.text = util.AtomicString(''.join(letters))

509	524

510 mailto = "mailto:" + email	525 mailto = "mailto:" + email

511 mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %	526 mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %

512 ord(letter) for letter in mailto])	527 ord(letter) for letter in mailto])

513 el.set('href', mailto)	528 el.set('href', mailto)

514 return el	529 return el

515

OLD	NEW