third_party/Python-Markdown/markdown/preprocessors.py - Issue 1392733002: Re-land "Check in a simple pure-python based Markdown previewer."

Side by Side Diff: third_party/Python-Markdown/markdown/preprocessors.py

Issue 1392733002: Re-land "Check in a simple pure-python based Markdown previewer." (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: clarify comment re: licenses, add bug #, use --no-find-copies Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 """

	2 PRE-PROCESSORS

	3 =============================================================================

	4

	5 Preprocessors work on source text before we start doing anything too

	6 complicated.

	7 """

	8

	9 from __future__ import absolute_import

	10 from __future__ import unicode_literals

	11 from . import util

	12 from . import odict

	13 import re

	14

	15

	16 def build_preprocessors(md_instance, **kwargs):

	17 """ Build the default set of preprocessors used by Markdown. """

	18 preprocessors = odict.OrderedDict()

	19 preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)

	20 if md_instance.safeMode != 'escape':

	21 preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)

	22 preprocessors["reference"] = ReferencePreprocessor(md_instance)

	23 return preprocessors

	24

	25

	26 class Preprocessor(util.Processor):

	27 """

	28 Preprocessors are run after the text is broken into lines.

	29

	30 Each preprocessor implements a "run" method that takes a pointer to a

	31 list of lines of the document, modifies it as necessary and returns

	32 either the same pointer or a pointer to a new list.

	33

	34 Preprocessors must extend markdown.Preprocessor.

	35

	36 """

	37 def run(self, lines):

	38 """

	39 Each subclass of Preprocessor should override the `run` method, which

	40 takes the document as a list of strings split by newlines and returns

	41 the (possibly modified) list of lines.

	42

	43 """

	44 pass # pragma: no cover

	45

	46

	47 class NormalizeWhitespace(Preprocessor):

	48 """ Normalize whitespace for consistant parsing. """

	49

	50 def run(self, lines):

	51 source = '\n'.join(lines)

	52 source = source.replace(util.STX, "").replace(util.ETX, "")

	53 source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"

	54 source = source.expandtabs(self.markdown.tab_length)

	55 source = re.sub(r'(?<=\n) +\n', '\n', source)

	56 return source.split('\n')

	57

	58

	59 class HtmlBlockPreprocessor(Preprocessor):

	60 """Remove html blocks from the text and store them for later retrieval."""

	61

	62 right_tag_patterns = ["</%s>", "%s>"]

	63 attrs_pattern = r"""

	64 \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"

	65 \| # OR

	66 \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value

	67 \| # OR

	68 \s+(?P<attr2>[^>"'/= ]+) # attr

	69 """

	70 left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s))\s\/?\>?' % \

	71 attrs_pattern

	72 attrs_re = re.compile(attrs_pattern, re.VERBOSE)

	73 left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)

	74 markdown_in_raw = False

	75

	76 def _get_left_tag(self, block):

	77 m = self.left_tag_re.match(block)

	78 if m:

	79 tag = m.group('tag')

	80 raw_attrs = m.group('attrs')

	81 attrs = {}

	82 if raw_attrs:

	83 for ma in self.attrs_re.finditer(raw_attrs):

	84 if ma.group('attr'):

	85 if ma.group('value'):

	86 attrs[ma.group('attr').strip()] = ma.group('value')

	87 else:

	88 attrs[ma.group('attr').strip()] = ""

	89 elif ma.group('attr1'):

	90 if ma.group('value1'):

	91 attrs[ma.group('attr1').strip()] = ma.group(

	92 'value1'

	93 )

	94 else:

	95 attrs[ma.group('attr1').strip()] = ""

	96 elif ma.group('attr2'):

	97 attrs[ma.group('attr2').strip()] = ""

	98 return tag, len(m.group(0)), attrs

	99 else:

	100 tag = block[1:].split(">", 1)[0].lower()

	101 return tag, len(tag)+2, {}

	102

	103 def _recursive_tagfind(self, ltag, rtag, start_index, block):

	104 while 1:

	105 i = block.find(rtag, start_index)

	106 if i == -1:

	107 return -1

	108 j = block.find(ltag, start_index)

	109 # if no ltag, or rtag found before another ltag, return index

	110 if (j > i or j == -1):

	111 return i + len(rtag)

	112 # another ltag found before rtag, use end of ltag as starting

	113 # point and search again

	114 j = block.find('>', j)

	115 start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)

	116 if start_index == -1:

	117 # HTML potentially malformed- ltag has no corresponding

	118 # rtag

	119 return -1

	120

	121 def _get_right_tag(self, left_tag, left_index, block):

	122 for p in self.right_tag_patterns:

	123 tag = p % left_tag

	124 i = self._recursive_tagfind(

	125 "<%s" % left_tag, tag, left_index, block

	126 )

	127 if i > 2:

	128 return tag.lstrip("<").rstrip(">"), i

	129 return block.rstrip()[-left_index:-1].lower(), len(block)

	130

	131 def _equal_tags(self, left_tag, right_tag):

	132 if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.

	133 return True

	134 if ("/" + left_tag) == right_tag:

	135 return True

	136 if (right_tag == "--" and left_tag == "--"):

	137 return True

	138 elif left_tag == right_tag[1:] and right_tag[0] == "/":

	139 return True

	140 else:

	141 return False

	142

	143 def _is_oneliner(self, tag):

	144 return (tag in ['hr', 'hr/'])

	145

	146 def _stringindex_to_listindex(self, stringindex, items):

	147 """

	148 Same effect as concatenating the strings in items,

	149 finding the character to which stringindex refers in that string,

	150 and returning the index of the item in which that character resides.

	151 """

	152 items.append('dummy')

	153 i, count = 0, 0

	154 while count <= stringindex:

	155 count += len(items[i])

	156 i += 1

	157 return i - 1

	158

	159 def _nested_markdown_in_html(self, items):

	160 """Find and process html child elements of the given element block."""

	161 for i, item in enumerate(items):

	162 if self.left_tag_re.match(item):

	163 left_tag, left_index, attrs = \

	164 self._get_left_tag(''.join(items[i:]))

	165 right_tag, data_index = self._get_right_tag(

	166 left_tag, left_index, ''.join(items[i:]))

	167 right_listindex = \

	168 self._stringindex_to_listindex(data_index, items[i:]) + i

	169 if 'markdown' in attrs.keys():

	170 items[i] = items[i][left_index:] # remove opening tag

	171 placeholder = self.markdown.htmlStash.store_tag(

	172 left_tag, attrs, i + 1, right_listindex + 1)

	173 items.insert(i, placeholder)

	174 if len(items) - right_listindex <= 1: # last nest, no tail

	175 right_listindex -= 1

	176 items[right_listindex] = items[right_listindex][

	177 :-len(right_tag) - 2] # remove closing tag

	178 else: # raw html

	179 if len(items) - right_listindex <= 1: # last element

	180 right_listindex -= 1

	181 offset = 1 if i == right_listindex else 0

	182 placeholder = self.markdown.htmlStash.store('\n\n'.join(

	183 items[i:right_listindex + offset]))

	184 del items[i:right_listindex + offset]

	185 items.insert(i, placeholder)

	186 return items

	187

	188 def run(self, lines):

	189 text = "\n".join(lines)

	190 new_blocks = []

	191 text = text.rsplit("\n\n")

	192 items = []

	193 left_tag = ''

	194 right_tag = ''

	195 in_tag = False # flag

	196

	197 while text:

	198 block = text[0]

	199 if block.startswith("\n"):

	200 block = block[1:]

	201 text = text[1:]

	202

	203 if block.startswith("\n"):

	204 block = block[1:]

	205

	206 if not in_tag:

	207 if block.startswith("<") and len(block.strip()) > 1:

	208

	209 if block[1:4] == "!--":

	210 # is a comment block

	211 left_tag, left_index, attrs = "--", 2, {}

	212 else:

	213 left_tag, left_index, attrs = self._get_left_tag(block)

	214 right_tag, data_index = self._get_right_tag(left_tag,

	215 left_index,

	216 block)

	217 # keep checking conditions below and maybe just append

	218

	219 if data_index < len(block) and (util.isBlockLevel(left_tag) or left_tag == '--'):

	220 text.insert(0, block[data_index:])

	221 block = block[:data_index]

	222

	223 if not (util.isBlockLevel(left_tag) or block[1] in ["!", "?" , "@", "%"]):

	224 new_blocks.append(block)

	225 continue

	226

	227 if self._is_oneliner(left_tag):

	228 new_blocks.append(block.strip())

	229 continue

	230

	231 if block.rstrip().endswith(">") \

	232 and self._equal_tags(left_tag, right_tag):

	233 if self.markdown_in_raw and 'markdown' in attrs.keys():

	234 block = block[left_index:-len(right_tag) - 2]

	235 new_blocks.append(self.markdown.htmlStash.

	236 store_tag(left_tag, attrs, 0, 2))

	237 new_blocks.extend([block])

	238 else:

	239 new_blocks.append(

	240 self.markdown.htmlStash.store(block.strip()))

	241 continue

	242 else:

	243 # if is block level tag and is not complete

	244 if (not self._equal_tags(left_tag, right_tag)) and \

	245 (util.isBlockLevel(left_tag) or left_tag == "--"):

	246 items.append(block.strip())

	247 in_tag = True

	248 else:

	249 new_blocks.append(

	250 self.markdown.htmlStash.store(block.strip())

	251 )

	252 continue

	253

	254 else:

	255 new_blocks.append(block)

	256

	257 else:

	258 items.append(block)

	259

	260 right_tag, data_index = self._get_right_tag(left_tag, 0, block)

	261

	262 if self._equal_tags(left_tag, right_tag):

	263 # if find closing tag

	264

	265 if data_index < len(block):

	266 # we have more text after right_tag

	267 items[-1] = block[:data_index]

	268 text.insert(0, block[data_index:])

	269

	270 in_tag = False

	271 if self.markdown_in_raw and 'markdown' in attrs.keys():

	272 items[0] = items[0][left_index:]

	273 items[-1] = items[-1][:-len(right_tag) - 2]

	274 if items[len(items) - 1]: # not a newline/empty string

	275 right_index = len(items) + 3

	276 else:

	277 right_index = len(items) + 2

	278 new_blocks.append(self.markdown.htmlStash.store_tag(

	279 left_tag, attrs, 0, right_index))

	280 placeholderslen = len(self.markdown.htmlStash.tag_data)

	281 new_blocks.extend(

	282 self._nested_markdown_in_html(items))

	283 nests = len(self.markdown.htmlStash.tag_data) - \

	284 placeholderslen

	285 self.markdown.htmlStash.tag_data[-1 - nests][

	286 'right_index'] += nests - 2

	287 else:

	288 new_blocks.append(

	289 self.markdown.htmlStash.store('\n\n'.join(items)))

	290 items = []

	291

	292 if items:

	293 if self.markdown_in_raw and 'markdown' in attrs.keys():

	294 items[0] = items[0][left_index:]

	295 items[-1] = items[-1][:-len(right_tag) - 2]

	296 if items[len(items) - 1]: # not a newline/empty string

	297 right_index = len(items) + 3

	298 else:

	299 right_index = len(items) + 2

	300 new_blocks.append(

	301 self.markdown.htmlStash.store_tag(

	302 left_tag, attrs, 0, right_index))

	303 placeholderslen = len(self.markdown.htmlStash.tag_data)

	304 new_blocks.extend(self._nested_markdown_in_html(items))

	305 nests = len(self.markdown.htmlStash.tag_data) - placeholderslen

	306 self.markdown.htmlStash.tag_data[-1 - nests][

	307 'right_index'] += nests - 2

	308 else:

	309 new_blocks.append(

	310 self.markdown.htmlStash.store('\n\n'.join(items)))

	311 new_blocks.append('\n')

	312

	313 new_text = "\n\n".join(new_blocks)

	314 return new_text.split("\n")

	315

	316

	317 class ReferencePreprocessor(Preprocessor):

	318 """ Remove reference definitions from text and store for later use. """

	319

	320 TITLE = r'[ ](\"(.)\"\|\'(.)\'\|$(.)$)[ ]*'

	321 RE = re.compile(

	322 r'^[ ]{0,3}\[([^\]])\]:\s([^ ])[ ](%s)?$' % TITLE, re.DOTALL

	323 )

	324 TITLE_RE = re.compile(r'^%s$' % TITLE)

	325

	326 def run(self, lines):

	327 new_text = []

	328 while lines:

	329 line = lines.pop(0)

	330 m = self.RE.match(line)

	331 if m:

	332 id = m.group(1).strip().lower()

	333 link = m.group(2).lstrip('<').rstrip('>')

	334 t = m.group(5) or m.group(6) or m.group(7)

	335 if not t:

	336 # Check next line for title

	337 tm = self.TITLE_RE.match(lines[0])

	338 if tm:

	339 lines.pop(0)

	340 t = tm.group(2) or tm.group(3) or tm.group(4)

	341 self.markdown.references[id] = (link, t)

	342 else:

	343 new_text.append(line)

	344

	345 return new_text # + "\n"

OLD	NEW

« no previous file with comments | « third_party/Python-Markdown/markdown/postprocessors.py ('k') | third_party/Python-Markdown/markdown/serializers.py » ('j') | no next file with comments »