third_party/markdown/preprocessors.py - Issue 133433002: Docserver: Support markdown for HTML content. Request thirdparty submission review.

Side by Side Diff: third_party/markdown/preprocessors.py

Issue 133433002: Docserver: Support markdown for HTML content. Request thirdparty submission review. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: change the version of app & cron.yaml Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 # markdown is released under the BSD license

	2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)

	3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)

	4 # Copyright 2004 Manfred Stienstra (the original version)

	5 #

	6 # All rights reserved.

	7 #

	8 # Redistribution and use in source and binary forms, with or without

	9 # modification, are permitted provided that the following conditions are met:

	10 #

	11 # * Redistributions of source code must retain the above copyright

	12 # notice, this list of conditions and the following disclaimer.

	13 # * Redistributions in binary form must reproduce the above copyright

	14 # notice, this list of conditions and the following disclaimer in the

	15 # documentation and/or other materials provided with the distribution.

	16 # * Neither the name of the <organization> nor the

	17 # names of its contributors may be used to endorse or promote products

	18 # derived from this software without specific prior written permission.

	19 #

	20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY

	21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

	22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

	23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT

	24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

	25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

	26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

	27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

	28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

	29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

	30 # POSSIBILITY OF SUCH DAMAGE.

	31

	32

	33 """

	34 PRE-PROCESSORS

	35 =============================================================================

	36

	37 Preprocessors work on source text before we start doing anything too

	38 complicated.

	39 """

	40

	41 from __future__ import absolute_import

	42 from __future__ import unicode_literals

	43 from . import util

	44 from . import odict

	45 import re

	46

	47

	48 def build_preprocessors(md_instance, **kwargs):

	49 """ Build the default set of preprocessors used by Markdown. """

	50 preprocessors = odict.OrderedDict()

	51 preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)

	52 if md_instance.safeMode != 'escape':

	53 preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)

	54 preprocessors["reference"] = ReferencePreprocessor(md_instance)

	55 return preprocessors

	56

	57

	58 class Preprocessor(util.Processor):

	59 """

	60 Preprocessors are run after the text is broken into lines.

	61

	62 Each preprocessor implements a "run" method that takes a pointer to a

	63 list of lines of the document, modifies it as necessary and returns

	64 either the same pointer or a pointer to a new list.

	65

	66 Preprocessors must extend markdown.Preprocessor.

	67

	68 """

	69 def run(self, lines):

	70 """

	71 Each subclass of Preprocessor should override the `run` method, which

	72 takes the document as a list of strings split by newlines and returns

	73 the (possibly modified) list of lines.

	74

	75 """

	76 pass

	77

	78

	79 class NormalizeWhitespace(Preprocessor):

	80 """ Normalize whitespace for consistant parsing. """

	81

	82 def run(self, lines):

	83 source = '\n'.join(lines)

	84 source = source.replace(util.STX, "").replace(util.ETX, "")

	85 source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"

	86 source = source.expandtabs(self.markdown.tab_length)

	87 source = re.sub(r'(?<=\n) +\n', '\n', source)

	88 return source.split('\n')

	89

	90

	91 class HtmlBlockPreprocessor(Preprocessor):

	92 """Remove html blocks from the text and store them for later retrieval."""

	93

	94 right_tag_patterns = ["</%s>", "%s>"]

	95 attrs_pattern = r"""

	96 \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"

	97 \| # OR

	98 \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value

	99 \| # OR

	100 \s+(?P<attr2>[^>"'/= ]+) # attr

	101 """

	102 left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s))\s\/?\>?' % attrs_pa ttern

	103 attrs_re = re.compile(attrs_pattern, re.VERBOSE)

	104 left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)

	105 markdown_in_raw = False

	106

	107 def _get_left_tag(self, block):

	108 m = self.left_tag_re.match(block)

	109 if m:

	110 tag = m.group('tag')

	111 raw_attrs = m.group('attrs')

	112 attrs = {}

	113 if raw_attrs:

	114 for ma in self.attrs_re.finditer(raw_attrs):

	115 if ma.group('attr'):

	116 if ma.group('value'):

	117 attrs[ma.group('attr').strip()] = ma.group('value')

	118 else:

	119 attrs[ma.group('attr').strip()] = ""

	120 elif ma.group('attr1'):

	121 if ma.group('value1'):

	122 attrs[ma.group('attr1').strip()] = ma.group('value1' )

	123 else:

	124 attrs[ma.group('attr1').strip()] = ""

	125 elif ma.group('attr2'):

	126 attrs[ma.group('attr2').strip()] = ""

	127 return tag, len(m.group(0)), attrs

	128 else:

	129 tag = block[1:].split(">", 1)[0].lower()

	130 return tag, len(tag)+2, {}

	131

	132 def _recursive_tagfind(self, ltag, rtag, start_index, block):

	133 while 1:

	134 i = block.find(rtag, start_index)

	135 if i == -1:

	136 return -1

	137 j = block.find(ltag, start_index)

	138 # if no ltag, or rtag found before another ltag, return index

	139 if (j > i or j == -1):

	140 return i + len(rtag)

	141 # another ltag found before rtag, use end of ltag as starting

	142 # point and search again

	143 j = block.find('>', j)

	144 start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)

	145 if start_index == -1:

	146 # HTML potentially malformed- ltag has no corresponding

	147 # rtag

	148 return -1

	149

	150 def _get_right_tag(self, left_tag, left_index, block):

	151 for p in self.right_tag_patterns:

	152 tag = p % left_tag

	153 i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block )

	154 if i > 2:

	155 return tag.lstrip("<").rstrip(">"), i

	156 return block.rstrip()[-left_index:-1].lower(), len(block)

	157

	158 def _equal_tags(self, left_tag, right_tag):

	159 if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.

	160 return True

	161 if ("/" + left_tag) == right_tag:

	162 return True

	163 if (right_tag == "--" and left_tag == "--"):

	164 return True

	165 elif left_tag == right_tag[1:] \

	166 and right_tag[0] == "/":

	167 return True

	168 else:

	169 return False

	170

	171 def _is_oneliner(self, tag):

	172 return (tag in ['hr', 'hr/'])

	173

	174 def run(self, lines):

	175 text = "\n".join(lines)

	176 new_blocks = []

	177 text = text.rsplit("\n\n")

	178 items = []

	179 left_tag = ''

	180 right_tag = ''

	181 in_tag = False # flag

	182

	183 while text:

	184 block = text[0]

	185 if block.startswith("\n"):

	186 block = block[1:]

	187 text = text[1:]

	188

	189 if block.startswith("\n"):

	190 block = block[1:]

	191

	192 if not in_tag:

	193 if block.startswith("<") and len(block.strip()) > 1:

	194

	195 if block[1] == "!":

	196 # is a comment block

	197 left_tag, left_index, attrs = "--", 2, {}

	198 else:

	199 left_tag, left_index, attrs = self._get_left_tag(block)

	200 right_tag, data_index = self._get_right_tag(left_tag,

	201 left_index,

	202 block)

	203 # keep checking conditions below and maybe just append

	204

	205 if data_index < len(block) \

	206 and (util.isBlockLevel(left_tag)

	207 or left_tag == '--'):

	208 text.insert(0, block[data_index:])

	209 block = block[:data_index]

	210

	211 if not (util.isBlockLevel(left_tag) \

	212 or block[1] in ["!", "?", "@", "%"]):

	213 new_blocks.append(block)

	214 continue

	215

	216 if self._is_oneliner(left_tag):

	217 new_blocks.append(block.strip())

	218 continue

	219

	220 if block.rstrip().endswith(">") \

	221 and self._equal_tags(left_tag, right_tag):

	222 if self.markdown_in_raw and 'markdown' in attrs.keys():

	223 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',

	224 '', block[:left_index])

	225 end = block[-len(right_tag)-2:]

	226 block = block[left_index:-len(right_tag)-2]

	227 new_blocks.append(

	228 self.markdown.htmlStash.store(start))

	229 new_blocks.append(block)

	230 new_blocks.append(

	231 self.markdown.htmlStash.store(end))

	232 else:

	233 new_blocks.append(

	234 self.markdown.htmlStash.store(block.strip()))

	235 continue

	236 else:

	237 # if is block level tag and is not complete

	238

	239 if util.isBlockLevel(left_tag) or left_tag == "--" \

	240 and not block.rstrip().endswith(">"):

	241 items.append(block.strip())

	242 in_tag = True

	243 else:

	244 new_blocks.append(

	245 self.markdown.htmlStash.store(block.strip()))

	246

	247 continue

	248

	249 new_blocks.append(block)

	250

	251 else:

	252 items.append(block)

	253

	254 right_tag, data_index = self._get_right_tag(left_tag, 0, block)

	255

	256 if self._equal_tags(left_tag, right_tag):

	257 # if find closing tag

	258

	259 if data_index < len(block):

	260 # we have more text after right_tag

	261 items[-1] = block[:data_index]

	262 text.insert(0, block[data_index:])

	263

	264 in_tag = False

	265 if self.markdown_in_raw and 'markdown' in attrs.keys():

	266 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',

	267 '', items[0][:left_index])

	268 items[0] = items[0][left_index:]

	269 end = items[-1][-len(right_tag)-2:]

	270 items[-1] = items[-1][:-len(right_tag)-2]

	271 new_blocks.append(

	272 self.markdown.htmlStash.store(start))

	273 new_blocks.extend(items)

	274 new_blocks.append(

	275 self.markdown.htmlStash.store(end))

	276 else:

	277 new_blocks.append(

	278 self.markdown.htmlStash.store('\n\n'.join(items)))

	279 items = []

	280

	281 if items:

	282 if self.markdown_in_raw and 'markdown' in attrs.keys():

	283 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',

	284 '', items[0][:left_index])

	285 items[0] = items[0][left_index:]

	286 end = items[-1][-len(right_tag)-2:]

	287 items[-1] = items[-1][:-len(right_tag)-2]

	288 new_blocks.append(

	289 self.markdown.htmlStash.store(start))

	290 new_blocks.extend(items)

	291 if end.strip():

	292 new_blocks.append(

	293 self.markdown.htmlStash.store(end))

	294 else:

	295 new_blocks.append(

	296 self.markdown.htmlStash.store('\n\n'.join(items)))

	297 #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)) )

	298 new_blocks.append('\n')

	299

	300 new_text = "\n\n".join(new_blocks)

	301 return new_text.split("\n")

	302

	303

	304 class ReferencePreprocessor(Preprocessor):

	305 """ Remove reference definitions from text and store for later use. """

	306

	307 TITLE = r'[ ](\"(.)\"\|\'(.)\'\|$(.)$)[ ]*'

	308 RE = re.compile(r'^[ ]{0,3}\[([^\]])\]:\s([^ ])[ ](%s)?$' % TITLE, re.DO TALL)

	309 TITLE_RE = re.compile(r'^%s$' % TITLE)

	310

	311 def run (self, lines):

	312 new_text = [];

	313 while lines:

	314 line = lines.pop(0)

	315 m = self.RE.match(line)

	316 if m:

	317 id = m.group(1).strip().lower()

	318 link = m.group(2).lstrip('<').rstrip('>')

	319 t = m.group(5) or m.group(6) or m.group(7)

	320 if not t:

	321 # Check next line for title

	322 tm = self.TITLE_RE.match(lines[0])

	323 if tm:

	324 lines.pop(0)

	325 t = tm.group(2) or tm.group(3) or tm.group(4)

	326 self.markdown.references[id] = (link, t)

	327 else:

	328 new_text.append(line)

	329

	330 return new_text #+ "\n"

OLD	NEW

« no previous file with comments | « third_party/markdown/postprocessors.py ('k') | third_party/markdown/serializers.py » ('j') | no next file with comments »