third_party/Python-Markdown/markdown/inlinepatterns.py - Issue 1392733002: Re-land "Check in a simple pure-python based Markdown previewer."

Side by Side Diff: third_party/Python-Markdown/markdown/inlinepatterns.py

Issue 1392733002: Re-land "Check in a simple pure-python based Markdown previewer." (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: clarify comment re: licenses, add bug #, use --no-find-copies Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 """

	2 INLINE PATTERNS

	3 =============================================================================

	4

	5 Inline patterns such as emphasis are handled by means of auxiliary

	6 objects, one per pattern. Pattern objects must be instances of classes

	7 that extend markdown.Pattern. Each pattern object uses a single regular

	8 expression and needs to support the following methods:

	9

	10 pattern.getCompiledRegExp() # returns a regular expression

	11

	12 pattern.handleMatch(m) # takes a match object and returns

	13 # an ElementTree element or just plain text

	14

	15 All of python markdown's built-in patterns subclass from Pattern,

	16 but you can add additional patterns that don't.

	17

	18 Also note that all the regular expressions used by inline must

	19 capture the whole block. For this reason, they all start with

	20 '^(.*)' and end with '(.*)$'. For the built-in expressions,

	21 Pattern takes care of adding the "^(.*)" and "(.*)$".

	22

	23 Finally, the order in which regular expressions are applied is very

	24 important - e.g. if we first replace http://.../ links with <a> tags

	25 and _then_ try to replace inline html, we would end up with a mess.

	26 So, we apply the expressions in the following order:

	27

	28 * escape and backticks have to go before everything else, so

	29 that we can preempt any markdown patterns by escaping them.

	30

	31 * then we handle auto-links (must be done before inline html)

	32

	33 * then we handle inline HTML. At this point we will simply

	34 replace all inline HTML strings with a placeholder and add

	35 the actual HTML to a hash.

	36

	37 * then inline images (must be done before links)

	38

	39 * then bracketed links, first regular then reference-style

	40

	41 * finally we apply strong and emphasis

	42 """

	43

	44 from __future__ import absolute_import

	45 from __future__ import unicode_literals

	46 from . import util

	47 from . import odict

	48 import re

	49 try: # pragma: no cover

	50 from urllib.parse import urlparse, urlunparse

	51 except ImportError: # pragma: no cover

	52 from urlparse import urlparse, urlunparse

	53 try: # pragma: no cover

	54 from html import entities

	55 except ImportError: # pragma: no cover

	56 import htmlentitydefs as entities

	57

	58

	59 def build_inlinepatterns(md_instance, **kwargs):

	60 """ Build the default set of inline patterns for Markdown. """

	61 inlinePatterns = odict.OrderedDict()

	62 inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)

	63 inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)

	64 inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)

	65 inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)

	66 inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)

	67 inlinePatterns["image_reference"] = ImageReferencePattern(

	68 IMAGE_REFERENCE_RE, md_instance

	69 )

	70 inlinePatterns["short_reference"] = ReferencePattern(

	71 SHORT_REF_RE, md_instance

	72 )

	73 inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)

	74 inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)

	75 inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')

	76 if md_instance.safeMode != 'escape':

	77 inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)

	78 inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)

	79 inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)

	80 inlinePatterns["em_strong"] = DoubleTagPattern(EM_STRONG_RE, 'strong,em')

	81 inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'em,strong')

	82 inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')

	83 inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')

	84 if md_instance.smart_emphasis:

	85 inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')

	86 else:

	87 inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')

	88 return inlinePatterns

	89

	90 """

	91 The actual regular expressions for patterns

	92 -----------------------------------------------------------------------------

	93 """

	94

	95 NOBRACKET = r'[^\]\[]*'

	96 BRK = (

	97 r'\[(' +

	98 (NOBRACKET + r'(\[')*6 +

	99 (NOBRACKET + r'\])')6 +

	100 NOBRACKET + r')\]'

	101 )

	102 NOIMG = r'(?<!\!)'

	103

	104 # `e=f()` or ``e=f("`")``

	105 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'

	106

	107 # \<

	108 ESCAPE_RE = r'\\(.)'

	109

	110 # emphasis

	111 EMPHASIS_RE = r'(\)([^\]+)\2'

	112

	113 # strong

	114 STRONG_RE = r'(\*{2}\|_{2})(.+?)\2'

	115

	116 # *strongem* or **emstrong**

	117 EM_STRONG_RE = r'(\\|_)\2{2}(.+?)\2(.?)\2{2}'

	118

	119 # *strongem*

	120 STRONG_EM_RE = r'(\\|_)\2{2}(.+?)\2{2}(.?)\2'

	121

	122 # _smart_emphasis_

	123 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'

	124

	125 # _emphasis_

	126 EMPHASIS_2_RE = r'(_)(.+?)\2'

	127

	128 # [text](url) or [text](<url>) or [text](url "title")

	129 LINK_RE = NOIMG + BRK + \

	130 r'''$\s(<.?>\|((?:(?:\(.?$)\|[^]))?)\s((['"])(.?)\12\s*)?\)'''

	131

	132 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)

	133 IMAGE_LINK_RE = r'\!' + BRK + r'\s$(<.?>\|([^")]+"[^"]"\|[^$]))\)'

	134

	135 # [Google][3]

	136 REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'

	137

	138 # [Google]

	139 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'

	140

	141 # ![alt text][2]

	142 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]'

	143

	144 # stand-alone * or _

	145 NOT_STRONG_RE = r'((^\| )(\*\|_)( \|$))'

	146

	147 # <http://www.123.com>

	148 AUTOLINK_RE = r'<((?:[Ff]\|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'

	149

	150 # <me@example.com>

	151 AUTOMAIL_RE = r'<([^> \!]@[^> ])>'

	152

	153 # <...>

	154 HTML_RE = r'(\<([a-zA-Z/][^\>]?\|\!--.?--)\>)'

	155

	156 # &

	157 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'

	158

	159 # two spaces at end of line

	160 LINE_BREAK_RE = r' \n'

	161

	162

	163 def dequote(string):

	164 """Remove quotes from around a string."""

	165 if ((string.startswith('"') and string.endswith('"')) or

	166 (string.startswith("'") and string.endswith("'"))):

	167 return string[1:-1]

	168 else:

	169 return string

	170

	171

	172 ATTR_RE = re.compile("\{@([^\}])=([^\}])}") # {@id=123}

	173

	174

	175 def handleAttributes(text, parent):

	176 """Set values of an element based on attribute definitions ({@id=123})."""

	177 def attributeCallback(match):

	178 parent.set(match.group(1), match.group(2).replace('\n', ' '))

	179 return ATTR_RE.sub(attributeCallback, text)

	180

	181

	182 """

	183 The pattern classes

	184 -----------------------------------------------------------------------------

	185 """

	186

	187

	188 class Pattern(object):

	189 """Base class that inline patterns subclass. """

	190

	191 def __init__(self, pattern, markdown_instance=None):

	192 """

	193 Create an instant of an inline pattern.

	194

	195 Keyword arguments:

	196

	197 * pattern: A regular expression that matches a pattern

	198

	199 """

	200 self.pattern = pattern

	201 self.compiled_re = re.compile("^(.?)%s(.?)$" % pattern,

	202 re.DOTALL \| re.UNICODE)

	203

	204 # Api for Markdown to pass safe_mode into instance

	205 self.safe_mode = False

	206 if markdown_instance:

	207 self.markdown = markdown_instance

	208

	209 def getCompiledRegExp(self):

	210 """ Return a compiled regular expression. """

	211 return self.compiled_re

	212

	213 def handleMatch(self, m):

	214 """Return a ElementTree element from the given match.

	215

	216 Subclasses should override this method.

	217

	218 Keyword arguments:

	219

	220 * m: A re match object containing a match of the pattern.

	221

	222 """

	223 pass # pragma: no cover

	224

	225 def type(self):

	226 """ Return class name, to define pattern type """

	227 return self.__class__.__name__

	228

	229 def unescape(self, text):

	230 """ Return unescaped text given text with an inline placeholder. """

	231 try:

	232 stash = self.markdown.treeprocessors['inline'].stashed_nodes

	233 except KeyError: # pragma: no cover

	234 return text

	235

	236 def itertext(el): # pragma: no cover

	237 ' Reimplement Element.itertext for older python versions '

	238 tag = el.tag

	239 if not isinstance(tag, util.string_type) and tag is not None:

	240 return

	241 if el.text:

	242 yield el.text

	243 for e in el:

	244 for s in itertext(e):

	245 yield s

	246 if e.tail:

	247 yield e.tail

	248

	249 def get_stash(m):

	250 id = m.group(1)

	251 if id in stash:

	252 value = stash.get(id)

	253 if isinstance(value, util.string_type):

	254 return value

	255 else:

	256 # An etree Element - return text content only

	257 return ''.join(itertext(value))

	258 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

	259

	260

	261 class SimpleTextPattern(Pattern):

	262 """ Return a simple text of group(2) of a Pattern. """

	263 def handleMatch(self, m):

	264 return m.group(2)

	265

	266

	267 class EscapePattern(Pattern):

	268 """ Return an escaped character. """

	269

	270 def handleMatch(self, m):

	271 char = m.group(2)

	272 if char in self.markdown.ESCAPED_CHARS:

	273 return '%s%s%s' % (util.STX, ord(char), util.ETX)

	274 else:

	275 return None

	276

	277

	278 class SimpleTagPattern(Pattern):

	279 """

	280 Return element of type `tag` with a text attribute of group(3)

	281 of a Pattern.

	282

	283 """

	284 def __init__(self, pattern, tag):

	285 Pattern.__init__(self, pattern)

	286 self.tag = tag

	287

	288 def handleMatch(self, m):

	289 el = util.etree.Element(self.tag)

	290 el.text = m.group(3)

	291 return el

	292

	293

	294 class SubstituteTagPattern(SimpleTagPattern):

	295 """ Return an element of type `tag` with no children. """

	296 def handleMatch(self, m):

	297 return util.etree.Element(self.tag)

	298

	299

	300 class BacktickPattern(Pattern):

	301 """ Return a `<code>` element containing the matching text. """

	302 def __init__(self, pattern):

	303 Pattern.__init__(self, pattern)

	304 self.tag = "code"

	305

	306 def handleMatch(self, m):

	307 el = util.etree.Element(self.tag)

	308 el.text = util.AtomicString(m.group(3).strip())

	309 return el

	310

	311

	312 class DoubleTagPattern(SimpleTagPattern):

	313 """Return a ElementTree element nested in tag2 nested in tag1.

	314

	315 Useful for strong emphasis etc.

	316

	317 """

	318 def handleMatch(self, m):

	319 tag1, tag2 = self.tag.split(",")

	320 el1 = util.etree.Element(tag1)

	321 el2 = util.etree.SubElement(el1, tag2)

	322 el2.text = m.group(3)

	323 if len(m.groups()) == 5:

	324 el2.tail = m.group(4)

	325 return el1

	326

	327

	328 class HtmlPattern(Pattern):

	329 """ Store raw inline html and return a placeholder. """

	330 def handleMatch(self, m):

	331 rawhtml = self.unescape(m.group(2))

	332 place_holder = self.markdown.htmlStash.store(rawhtml)

	333 return place_holder

	334

	335 def unescape(self, text):

	336 """ Return unescaped text given text with an inline placeholder. """

	337 try:

	338 stash = self.markdown.treeprocessors['inline'].stashed_nodes

	339 except KeyError: # pragma: no cover

	340 return text

	341

	342 def get_stash(m):

	343 id = m.group(1)

	344 value = stash.get(id)

	345 if value is not None:

	346 try:

	347 return self.markdown.serializer(value)

	348 except:

	349 return '\%s' % value

	350

	351 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

	352

	353

	354 class LinkPattern(Pattern):

	355 """ Return a link element from the given match. """

	356 def handleMatch(self, m):

	357 el = util.etree.Element("a")

	358 el.text = m.group(2)

	359 title = m.group(13)

	360 href = m.group(9)

	361

	362 if href:

	363 if href[0] == "<":

	364 href = href[1:-1]

	365 el.set("href", self.sanitize_url(self.unescape(href.strip())))

	366 else:

	367 el.set("href", "")

	368

	369 if title:

	370 title = dequote(self.unescape(title))

	371 el.set("title", title)

	372 return el

	373

	374 def sanitize_url(self, url):

	375 """

	376 Sanitize a url against xss attacks in "safe_mode".

	377

	378 Rather than specifically blacklisting `javascript:alert("XSS")` and all

	379 its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known

	380 safe url formats. Most urls contain a network location, however some

	381 are known not to (i.e.: mailto links). Script urls do not contain a

	382 location. Additionally, for `javascript:...`, the scheme would be

	383 "javascript" but some aliases will appear to `urlparse()` to have no

	384 scheme. On top of that relative links (i.e.: "foo/bar.html") have no

	385 scheme. Therefore we must check "path", "parameters", "query" and

	386 "fragment" for any literal colons. We don't check "scheme" for colons

	387 because it should never have any and "netloc" must allow the form:

	388 `username:password@host:port`.

	389

	390 """

	391 if not self.markdown.safeMode:

	392 # Return immediately bipassing parsing.

	393 return url

	394

	395 try:

	396 scheme, netloc, path, params, query, fragment = url = urlparse(url)

	397 except ValueError: # pragma: no cover

	398 # Bad url - so bad it couldn't be parsed.

	399 return ''

	400

	401 locless_schemes = ['', 'mailto', 'news']

	402 allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']

	403 if scheme not in allowed_schemes:

	404 # Not a known (allowed) scheme. Not safe.

	405 return ''

	406

	407 if netloc == '' and scheme not in locless_schemes: # pragma: no cover

	408 # This should not happen. Treat as suspect.

	409 return ''

	410

	411 for part in url[2:]:

	412 if ":" in part:

	413 # A colon in "path", "parameters", "query"

	414 # or "fragment" is suspect.

	415 return ''

	416

	417 # Url passes all tests. Return url as-is.

	418 return urlunparse(url)

	419

	420

	421 class ImagePattern(LinkPattern):

	422 """ Return a img element from the given match. """

	423 def handleMatch(self, m):

	424 el = util.etree.Element("img")

	425 src_parts = m.group(9).split()

	426 if src_parts:

	427 src = src_parts[0]

	428 if src[0] == "<" and src[-1] == ">":

	429 src = src[1:-1]

	430 el.set('src', self.sanitize_url(self.unescape(src)))

	431 else:

	432 el.set('src', "")

	433 if len(src_parts) > 1:

	434 el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))

	435

	436 if self.markdown.enable_attributes:

	437 truealt = handleAttributes(m.group(2), el)

	438 else:

	439 truealt = m.group(2)

	440

	441 el.set('alt', self.unescape(truealt))

	442 return el

	443

	444

	445 class ReferencePattern(LinkPattern):

	446 """ Match to a stored reference and return link element. """

	447

	448 NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

	449

	450 def handleMatch(self, m):

	451 try:

	452 id = m.group(9).lower()

	453 except IndexError:

	454 id = None

	455 if not id:

	456 # if we got something like "[Google][]" or "[Goggle]"

	457 # we'll use "google" as the id

	458 id = m.group(2).lower()

	459

	460 # Clean up linebreaks in id

	461 id = self.NEWLINE_CLEANUP_RE.sub(' ', id)

	462 if id not in self.markdown.references: # ignore undefined refs

	463 return None

	464 href, title = self.markdown.references[id]

	465

	466 text = m.group(2)

	467 return self.makeTag(href, title, text)

	468

	469 def makeTag(self, href, title, text):

	470 el = util.etree.Element('a')

	471

	472 el.set('href', self.sanitize_url(href))

	473 if title:

	474 el.set('title', title)

	475

	476 el.text = text

	477 return el

	478

	479

	480 class ImageReferencePattern(ReferencePattern):

	481 """ Match to a stored reference and return img element. """

	482 def makeTag(self, href, title, text):

	483 el = util.etree.Element("img")

	484 el.set("src", self.sanitize_url(href))

	485 if title:

	486 el.set("title", title)

	487

	488 if self.markdown.enable_attributes:

	489 text = handleAttributes(text, el)

	490

	491 el.set("alt", self.unescape(text))

	492 return el

	493

	494

	495 class AutolinkPattern(Pattern):

	496 """ Return a link Element given an autolink (`<http://example/com>`). """

	497 def handleMatch(self, m):

	498 el = util.etree.Element("a")

	499 el.set('href', self.unescape(m.group(2)))

	500 el.text = util.AtomicString(m.group(2))

	501 return el

	502

	503

	504 class AutomailPattern(Pattern):

	505 """

	506 Return a mailto link Element given an automail link (`<foo@example.com>`).

	507 """

	508 def handleMatch(self, m):

	509 el = util.etree.Element('a')

	510 email = self.unescape(m.group(2))

	511 if email.startswith("mailto:"):

	512 email = email[len("mailto:"):]

	513

	514 def codepoint2name(code):

	515 """Return entity definition by code, or the code if not defined."""

	516 entity = entities.codepoint2name.get(code)

	517 if entity:

	518 return "%s%s;" % (util.AMP_SUBSTITUTE, entity)

	519 else:

	520 return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

	521

	522 letters = [codepoint2name(ord(letter)) for letter in email]

	523 el.text = util.AtomicString(''.join(letters))

	524

	525 mailto = "mailto:" + email

	526 mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %

	527 ord(letter) for letter in mailto])

	528 el.set('href', mailto)

	529 return el

OLD	NEW

« no previous file with comments | « third_party/Python-Markdown/markdown/extensions/wikilinks.py ('k') | third_party/Python-Markdown/markdown/odict.py » ('j') | no next file with comments »