third_party/markdown/inlinepatterns.py - Issue 93743005: Support markdown template for html editor

Side by Side Diff: third_party/markdown/inlinepatterns.py

Issue 93743005: Support markdown template for html editor (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: fix path without dir Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 """

	2 INLINE PATTERNS

	3 =============================================================================

	4

	5 Inline patterns such as emphasis are handled by means of auxiliary

	6 objects, one per pattern. Pattern objects must be instances of classes

	7 that extend markdown.Pattern. Each pattern object uses a single regular

	8 expression and needs to support the following methods:

	9

	10 pattern.getCompiledRegExp() # returns a regular expression

	11

	12 pattern.handleMatch(m) # takes a match object and returns

	13 # an ElementTree element or just plain text

	14

	15 All of python markdown's built-in patterns subclass from Pattern,

	16 but you can add additional patterns that don't.

	17

	18 Also note that all the regular expressions used by inline must

	19 capture the whole block. For this reason, they all start with

	20 '^(.*)' and end with '(.*)!'. For the built-in expressions,

	21 Pattern takes care of adding the "^(.*)" and "(.*)!".

	22

	23 Finally, the order in which regular expressions are applied is very

	24 important - e.g. if we first replace http://.../ links with <a> tags

	25 and _then_ try to replace inline html, we would end up with a mess.

	26 So, we apply the expressions in the following order:

	27

	28 * escape and backticks have to go before everything else, so

	29 that we can preempt any markdown patterns by escaping them.

	30

	31 * then we handle auto-links (must be done before inline html)

	32

	33 * then we handle inline HTML. At this point we will simply

	34 replace all inline HTML strings with a placeholder and add

	35 the actual HTML to a hash.

	36

	37 * then inline images (must be done before links)

	38

	39 * then bracketed links, first regular then reference-style

	40

	41 * finally we apply strong and emphasis

	42 """

	43

	44 from __future__ import absolute_import

	45 from __future__ import unicode_literals

	46 from . import util

	47 from . import odict

	48 import re

	49 try:

	50 from urllib.parse import urlparse, urlunparse

	51 except ImportError:

	52 from urlparse import urlparse, urlunparse

	53 try:

	54 from html import entities

	55 except ImportError:

	56 import htmlentitydefs as entities

	57

	58

	59 def build_inlinepatterns(md_instance, **kwargs):

	60 """ Build the default set of inline patterns for Markdown. """

	61 inlinePatterns = odict.OrderedDict()

	62 inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)

	63 inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)

	64 inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)

	65 inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)

	66 inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)

	67 inlinePatterns["image_reference"] = \

	68 ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)

	69 inlinePatterns["short_reference"] = \

	70 ReferencePattern(SHORT_REF_RE, md_instance)

	71 inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)

	72 inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)

	73 inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')

	74 if md_instance.safeMode != 'escape':

	75 inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)

	76 inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)

	77 inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)

	78 inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em')

	79 inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')

	80 inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')

	81 if md_instance.smart_emphasis:

	82 inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')

	83 else:

	84 inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')

	85 return inlinePatterns

	86

	87 """

	88 The actual regular expressions for patterns

	89 -----------------------------------------------------------------------------

	90 """

	91

	92 NOBRACKET = r'[^\]\[]*'

	93 BRK = ( r'\[('

	94 + (NOBRACKET + r'(\[')*6

	95 + (NOBRACKET+ r'\])')6

	96 + NOBRACKET + r')\]' )

	97 NOIMG = r'(?<!\!)'

	98

	99 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``

	100 ESCAPE_RE = r'\\(.)' # \<

	101 EMPHASIS_RE = r'(\)([^\]+)\2' # emphasis

	102 STRONG_RE = r'(\{2}\|_{2})(.+?)\2' # strong*

	103 STRONG_EM_RE = r'(\{3}\|_{3})(.+?)\2' # strong*

	104 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)' # _smart_emphasis_

	105 EMPHASIS_2_RE = r'(_)(.+?)\2' # _emphasis_

	106 LINK_RE = NOIMG + BRK + \

	107 r'''$\s(<.?>\|((?:(?:\(.?$)\|[^]))?)\s((['"])(.?)\12\s*)?\)'''

	108 # [text](url) or [text](<url>) or [text](url "title")

	109

	110 IMAGE_LINK_RE = r'\!' + BRK + r'\s$(<.?>\|([^$]*))\)'

	111 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)

	112 REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]' # [Google][3]

	113 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]' # [Google]

	114 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]

	115 NOT_STRONG_RE = r'((^\| )(\\|_)( \|$))' # stand-alone or _

	116 AUTOLINK_RE = r'<((?:[Ff]\|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.co m>

	117 AUTOMAIL_RE = r'<([^> \!]@[^> ])>' # <me@example.com>

	118

	119 HTML_RE = r'(\<([a-zA-Z/][^\>]?\|\!--.?--)\>)' # <...>

	120 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &

	121 LINE_BREAK_RE = r' \n' # two spaces at end of line

	122

	123

	124 def dequote(string):

	125 """Remove quotes from around a string."""

	126 if ( ( string.startswith('"') and string.endswith('"'))

	127 or (string.startswith("'") and string.endswith("'")) ):

	128 return string[1:-1]

	129 else:

	130 return string

	131

	132 ATTR_RE = re.compile("\{@([^\}])=([^\}])}") # {@id=123}

	133

	134 def handleAttributes(text, parent):

	135 """Set values of an element based on attribute definitions ({@id=123})."""

	136 def attributeCallback(match):

	137 parent.set(match.group(1), match.group(2).replace('\n', ' '))

	138 return ATTR_RE.sub(attributeCallback, text)

	139

	140

	141 """

	142 The pattern classes

	143 -----------------------------------------------------------------------------

	144 """

	145

	146 class Pattern(object):

	147 """Base class that inline patterns subclass. """

	148

	149 def __init__(self, pattern, markdown_instance=None):

	150 """

	151 Create an instant of an inline pattern.

	152

	153 Keyword arguments:

	154

	155 * pattern: A regular expression that matches a pattern

	156

	157 """

	158 self.pattern = pattern

	159 self.compiled_re = re.compile("^(.?)%s(.?)$" % pattern,

	160 re.DOTALL \| re.UNICODE)

	161

	162 # Api for Markdown to pass safe_mode into instance

	163 self.safe_mode = False

	164 if markdown_instance:

	165 self.markdown = markdown_instance

	166

	167 def getCompiledRegExp(self):

	168 """ Return a compiled regular expression. """

	169 return self.compiled_re

	170

	171 def handleMatch(self, m):

	172 """Return a ElementTree element from the given match.

	173

	174 Subclasses should override this method.

	175

	176 Keyword arguments:

	177

	178 * m: A re match object containing a match of the pattern.

	179

	180 """

	181 pass

	182

	183 def type(self):

	184 """ Return class name, to define pattern type """

	185 return self.__class__.__name__

	186

	187 def unescape(self, text):

	188 """ Return unescaped text given text with an inline placeholder. """

	189 try:

	190 stash = self.markdown.treeprocessors['inline'].stashed_nodes

	191 except KeyError:

	192 return text

	193 def itertext(el):

	194 ' Reimplement Element.itertext for older python versions '

	195 tag = el.tag

	196 if not isinstance(tag, util.string_type) and tag is not None:

	197 return

	198 if el.text:

	199 yield el.text

	200 for e in el:

	201 for s in itertext(e):

	202 yield s

	203 if e.tail:

	204 yield e.tail

	205 def get_stash(m):

	206 id = m.group(1)

	207 if id in stash:

	208 value = stash.get(id)

	209 if isinstance(value, util.string_type):

	210 return value

	211 else:

	212 # An etree Element - return text content only

	213 return ''.join(itertext(value))

	214 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

	215

	216

	217 class SimpleTextPattern(Pattern):

	218 """ Return a simple text of group(2) of a Pattern. """

	219 def handleMatch(self, m):

	220 text = m.group(2)

	221 if text == util.INLINE_PLACEHOLDER_PREFIX:

	222 return None

	223 return text

	224

	225

	226 class EscapePattern(Pattern):

	227 """ Return an escaped character. """

	228

	229 def handleMatch(self, m):

	230 char = m.group(2)

	231 if char in self.markdown.ESCAPED_CHARS:

	232 return '%s%s%s' % (util.STX, ord(char), util.ETX)

	233 else:

	234 return '\\%s' % char

	235

	236

	237 class SimpleTagPattern(Pattern):

	238 """

	239 Return element of type `tag` with a text attribute of group(3)

	240 of a Pattern.

	241

	242 """

	243 def __init__ (self, pattern, tag):

	244 Pattern.__init__(self, pattern)

	245 self.tag = tag

	246

	247 def handleMatch(self, m):

	248 el = util.etree.Element(self.tag)

	249 el.text = m.group(3)

	250 return el

	251

	252

	253 class SubstituteTagPattern(SimpleTagPattern):

	254 """ Return an element of type `tag` with no children. """

	255 def handleMatch (self, m):

	256 return util.etree.Element(self.tag)

	257

	258

	259 class BacktickPattern(Pattern):

	260 """ Return a `<code>` element containing the matching text. """

	261 def __init__ (self, pattern):

	262 Pattern.__init__(self, pattern)

	263 self.tag = "code"

	264

	265 def handleMatch(self, m):

	266 el = util.etree.Element(self.tag)

	267 el.text = util.AtomicString(m.group(3).strip())

	268 return el

	269

	270

	271 class DoubleTagPattern(SimpleTagPattern):

	272 """Return a ElementTree element nested in tag2 nested in tag1.

	273

	274 Useful for strong emphasis etc.

	275

	276 """

	277 def handleMatch(self, m):

	278 tag1, tag2 = self.tag.split(",")

	279 el1 = util.etree.Element(tag1)

	280 el2 = util.etree.SubElement(el1, tag2)

	281 el2.text = m.group(3)

	282 return el1

	283

	284

	285 class HtmlPattern(Pattern):

	286 """ Store raw inline html and return a placeholder. """

	287 def handleMatch (self, m):

	288 rawhtml = self.unescape(m.group(2))

	289 place_holder = self.markdown.htmlStash.store(rawhtml)

	290 return place_holder

	291

	292 def unescape(self, text):

	293 """ Return unescaped text given text with an inline placeholder. """

	294 try:

	295 stash = self.markdown.treeprocessors['inline'].stashed_nodes

	296 except KeyError:

	297 return text

	298 def get_stash(m):

	299 id = m.group(1)

	300 value = stash.get(id)

	301 if value is not None:

	302 try:

	303 return self.markdown.serializer(value)

	304 except:

	305 return '\%s' % value

	306

	307 return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

	308

	309

	310 class LinkPattern(Pattern):

	311 """ Return a link element from the given match. """

	312 def handleMatch(self, m):

	313 el = util.etree.Element("a")

	314 el.text = m.group(2)

	315 title = m.group(13)

	316 href = m.group(9)

	317

	318 if href:

	319 if href[0] == "<":

	320 href = href[1:-1]

	321 el.set("href", self.sanitize_url(self.unescape(href.strip())))

	322 else:

	323 el.set("href", "")

	324

	325 if title:

	326 title = dequote(self.unescape(title))

	327 el.set("title", title)

	328 return el

	329

	330 def sanitize_url(self, url):

	331 """

	332 Sanitize a url against xss attacks in "safe_mode".

	333

	334 Rather than specifically blacklisting `javascript:alert("XSS")` and all

	335 its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known

	336 safe url formats. Most urls contain a network location, however some

	337 are known not to (i.e.: mailto links). Script urls do not contain a

	338 location. Additionally, for `javascript:...`, the scheme would be

	339 "javascript" but some aliases will appear to `urlparse()` to have no

	340 scheme. On top of that relative links (i.e.: "foo/bar.html") have no

	341 scheme. Therefore we must check "path", "parameters", "query" and

	342 "fragment" for any literal colons. We don't check "scheme" for colons

	343 because it should never have any and "netloc" must allow the form:

	344 `username:password@host:port`.

	345

	346 """

	347 url = url.replace(' ', '%20')

	348 if not self.markdown.safeMode:

	349 # Return immediately bipassing parsing.

	350 return url

	351

	352 try:

	353 scheme, netloc, path, params, query, fragment = url = urlparse(url)

	354 except ValueError:

	355 # Bad url - so bad it couldn't be parsed.

	356 return ''

	357

	358 locless_schemes = ['', 'mailto', 'news']

	359 allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']

	360 if scheme not in allowed_schemes:

	361 # Not a known (allowed) scheme. Not safe.

	362 return ''

	363

	364 if netloc == '' and scheme not in locless_schemes:

	365 # This should not happen. Treat as suspect.

	366 return ''

	367

	368 for part in url[2:]:

	369 if ":" in part:

	370 # A colon in "path", "parameters", "query" or "fragment" is susp ect.

	371 return ''

	372

	373 # Url passes all tests. Return url as-is.

	374 return urlunparse(url)

	375

	376 class ImagePattern(LinkPattern):

	377 """ Return a img element from the given match. """

	378 def handleMatch(self, m):

	379 el = util.etree.Element("img")

	380 src_parts = m.group(9).split()

	381 if src_parts:

	382 src = src_parts[0]

	383 if src[0] == "<" and src[-1] == ">":

	384 src = src[1:-1]

	385 el.set('src', self.sanitize_url(self.unescape(src)))

	386 else:

	387 el.set('src', "")

	388 if len(src_parts) > 1:

	389 el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))

	390

	391 if self.markdown.enable_attributes:

	392 truealt = handleAttributes(m.group(2), el)

	393 else:

	394 truealt = m.group(2)

	395

	396 el.set('alt', self.unescape(truealt))

	397 return el

	398

	399 class ReferencePattern(LinkPattern):

	400 """ Match to a stored reference and return link element. """

	401

	402 NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

	403

	404 def handleMatch(self, m):

	405 try:

	406 id = m.group(9).lower()

	407 except IndexError:

	408 id = None

	409 if not id:

	410 # if we got something like "[Google][]" or "[Goggle]"

	411 # we'll use "google" as the id

	412 id = m.group(2).lower()

	413

	414 # Clean up linebreaks in id

	415 id = self.NEWLINE_CLEANUP_RE.sub(' ', id)

	416 if not id in self.markdown.references: # ignore undefined refs

	417 return None

	418 href, title = self.markdown.references[id]

	419

	420 text = m.group(2)

	421 return self.makeTag(href, title, text)

	422

	423 def makeTag(self, href, title, text):

	424 el = util.etree.Element('a')

	425

	426 el.set('href', self.sanitize_url(href))

	427 if title:

	428 el.set('title', title)

	429

	430 el.text = text

	431 return el

	432

	433

	434 class ImageReferencePattern(ReferencePattern):

	435 """ Match to a stored reference and return img element. """

	436 def makeTag(self, href, title, text):

	437 el = util.etree.Element("img")

	438 el.set("src", self.sanitize_url(href))

	439 if title:

	440 el.set("title", title)

	441

	442 if self.markdown.enable_attributes:

	443 text = handleAttributes(text, el)

	444

	445 el.set("alt", self.unescape(text))

	446 return el

	447

	448

	449 class AutolinkPattern(Pattern):

	450 """ Return a link Element given an autolink (`<http://example/com>`). """

	451 def handleMatch(self, m):

	452 el = util.etree.Element("a")

	453 el.set('href', self.unescape(m.group(2)))

	454 el.text = util.AtomicString(m.group(2))

	455 return el

	456

	457 class AutomailPattern(Pattern):

	458 """

	459 Return a mailto link Element given an automail link (`<foo@example.com>`).

	460 """

	461 def handleMatch(self, m):

	462 el = util.etree.Element('a')

	463 email = self.unescape(m.group(2))

	464 if email.startswith("mailto:"):

	465 email = email[len("mailto:"):]

	466

	467 def codepoint2name(code):

	468 """Return entity definition by code, or the code if not defined."""

	469 entity = entities.codepoint2name.get(code)

	470 if entity:

	471 return "%s%s;" % (util.AMP_SUBSTITUTE, entity)

	472 else:

	473 return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

	474

	475 letters = [codepoint2name(ord(letter)) for letter in email]

	476 el.text = util.AtomicString(''.join(letters))

	477

	478 mailto = "mailto:" + email

	479 mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %

	480 ord(letter) for letter in mailto])

	481 el.set('href', mailto)

	482 return el

	483

OLD	NEW

« no previous file with comments | « third_party/markdown/extensions/wikilinks.py ('k') | third_party/markdown/odict.py » ('j') | no next file with comments »