Chromium Code Reviews

Unified Diff: third_party/markdown/preprocessors.py

Issue 133433002: Docserver: Support markdown for HTML content. Request thirdparty submission review. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: change the version of app & cron.yaml Created 6 years, 11 months ago
Index: third_party/markdown/preprocessors.py
diff --git a/third_party/markdown/preprocessors.py b/third_party/markdown/preprocessors.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f1cfe7777fbacde0ae2a68ac910e343426e9fb7
--- /dev/null
+++ b/third_party/markdown/preprocessors.py
@@ -0,0 +1,330 @@
+# markdown is released under the BSD license
+# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
+# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+# Copyright 2004 Manfred Stienstra (the original version)
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of the <organization> nor the
+#   names of its contributors may be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
+# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+PRE-PROCESSORS
+=============================================================================
+
+Preprocessors work on source text before we start doing anything too
+complicated.
+"""
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from . import util
+from . import odict
+import re
+
+
+def build_preprocessors(md_instance, **kwargs):
+    """ Build the default set of preprocessors used by Markdown. """
+    preprocessors = odict.OrderedDict()
+    preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
+    if md_instance.safeMode != 'escape':
+        preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
+    preprocessors["reference"] = ReferencePreprocessor(md_instance)
+    return preprocessors
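
The ordered dict returned here is consumed by the Markdown core, which runs each preprocessor over the document's lines in registration order. A rough sketch of that driver loop, for review context only (it assumes the stock Markdown.convert() wiring and is not part of this file):

    # Illustrative sketch, not part of the patch.
    lines = text.split("\n")
    for prep in md_instance.preprocessors.values():
        lines = prep.run(lines)  # each pass may return a brand-new list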
+
+
+class Preprocessor(util.Processor):
+    """
+    Preprocessors are run after the text is broken into lines.
+
+    Each preprocessor implements a "run" method that takes a pointer to a
+    list of lines of the document, modifies it as necessary and returns
+    either the same pointer or a pointer to a new list.
+
+    Preprocessors must extend markdown.Preprocessor.
+
+    """
+    def run(self, lines):
+        """
+        Each subclass of Preprocessor should override the `run` method, which
+        takes the document as a list of strings split by newlines and returns
+        the (possibly modified) list of lines.
+
+        """
+        pass
+
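The contract described above boils down to: accept the document as a list of lines and return a (possibly new) list of lines. A minimal hypothetical subclass, purely for illustration (the class name and behaviour are invented and not part of this patch):

    # Illustrative sketch, not part of the patch.
    class StripTodoLines(Preprocessor):
        """ Drop any line that starts with 'TODO:'. """
        def run(self, lines):
            return [line for line in lines if not line.startswith('TODO:')]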
+
+class NormalizeWhitespace(Preprocessor):
+    """ Normalize whitespace for consistent parsing. """
+
+    def run(self, lines):
+        source = '\n'.join(lines)
+        source = source.replace(util.STX, "").replace(util.ETX, "")
+        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
+        source = source.expandtabs(self.markdown.tab_length)
+        source = re.sub(r'(?<=\n) +\n', '\n', source)
+        return source.split('\n')
+
+
+class HtmlBlockPreprocessor(Preprocessor):
+    """Remove html blocks from the text and store them for later retrieval."""
+
+    right_tag_patterns = ["</%s>", "%s>"]
+    attrs_pattern = r"""
+        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)  # attr="value"
+        |                                                        # OR
+        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)              # attr=value
+        |                                                        # OR
+        \s+(?P<attr2>[^>"'/= ]+)                                 # attr
+        """
+    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
+    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
+    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
+    markdown_in_raw = False
+
+    def _get_left_tag(self, block):
+        m = self.left_tag_re.match(block)
+        if m:
+            tag = m.group('tag')
+            raw_attrs = m.group('attrs')
+            attrs = {}
+            if raw_attrs:
+                for ma in self.attrs_re.finditer(raw_attrs):
+                    if ma.group('attr'):
+                        if ma.group('value'):
+                            attrs[ma.group('attr').strip()] = ma.group('value')
+                        else:
+                            attrs[ma.group('attr').strip()] = ""
+                    elif ma.group('attr1'):
+                        if ma.group('value1'):
+                            attrs[ma.group('attr1').strip()] = ma.group('value1')
+                        else:
+                            attrs[ma.group('attr1').strip()] = ""
+                    elif ma.group('attr2'):
+                        attrs[ma.group('attr2').strip()] = ""
+            return tag, len(m.group(0)), attrs
+        else:
+            tag = block[1:].split(">", 1)[0].lower()
+            return tag, len(tag)+2, {}
+
+    def _recursive_tagfind(self, ltag, rtag, start_index, block):
+        while 1:
+            i = block.find(rtag, start_index)
+            if i == -1:
+                return -1
+            j = block.find(ltag, start_index)
+            # if no ltag, or rtag found before another ltag, return index
+            if (j > i or j == -1):
+                return i + len(rtag)
+            # another ltag found before rtag, use end of ltag as starting
+            # point and search again
+            j = block.find('>', j)
+            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
+            if start_index == -1:
+                # HTML potentially malformed- ltag has no corresponding
+                # rtag
+                return -1
+
+    def _get_right_tag(self, left_tag, left_index, block):
+        for p in self.right_tag_patterns:
+            tag = p % left_tag
+            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
+            if i > 2:
+                return tag.lstrip("<").rstrip(">"), i
+        return block.rstrip()[-left_index:-1].lower(), len(block)
+
+    def _equal_tags(self, left_tag, right_tag):
+        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
+            return True
+        if ("/" + left_tag) == right_tag:
+            return True
+        if (right_tag == "--" and left_tag == "--"):
+            return True
+        elif left_tag == right_tag[1:] \
+                and right_tag[0] == "/":
+            return True
+        else:
+            return False
+
+    def _is_oneliner(self, tag):
+        return (tag in ['hr', 'hr/'])
+
+    def run(self, lines):
+        text = "\n".join(lines)
+        new_blocks = []
+        text = text.rsplit("\n\n")
+        items = []
+        left_tag = ''
+        right_tag = ''
+        in_tag = False # flag
+
+        while text:
+            block = text[0]
+            if block.startswith("\n"):
+                block = block[1:]
+            text = text[1:]
+
+            if block.startswith("\n"):
+                block = block[1:]
+
+            if not in_tag:
+                if block.startswith("<") and len(block.strip()) > 1:
+
+                    if block[1] == "!":
+                        # is a comment block
+                        left_tag, left_index, attrs = "--", 2, {}
+                    else:
+                        left_tag, left_index, attrs = self._get_left_tag(block)
+                    right_tag, data_index = self._get_right_tag(left_tag,
+                                                                left_index,
+                                                                block)
+                    # keep checking conditions below and maybe just append
+
+                    if data_index < len(block) \
+                            and (util.isBlockLevel(left_tag)
+                                 or left_tag == '--'):
+                        text.insert(0, block[data_index:])
+                        block = block[:data_index]
+
+                    if not (util.isBlockLevel(left_tag) \
+                            or block[1] in ["!", "?", "@", "%"]):
+                        new_blocks.append(block)
+                        continue
+
+                    if self._is_oneliner(left_tag):
+                        new_blocks.append(block.strip())
+                        continue
+
+                    if block.rstrip().endswith(">") \
+                            and self._equal_tags(left_tag, right_tag):
+                        if self.markdown_in_raw and 'markdown' in attrs.keys():
+                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
+                                           '', block[:left_index])
+                            end = block[-len(right_tag)-2:]
+                            block = block[left_index:-len(right_tag)-2]
+                            new_blocks.append(
+                                self.markdown.htmlStash.store(start))
+                            new_blocks.append(block)
+                            new_blocks.append(
+                                self.markdown.htmlStash.store(end))
+                        else:
+                            new_blocks.append(
+                                self.markdown.htmlStash.store(block.strip()))
+                        continue
+                    else:
+                        # if is block level tag and is not complete
+
+                        if util.isBlockLevel(left_tag) or left_tag == "--" \
+                                and not block.rstrip().endswith(">"):
+                            items.append(block.strip())
+                            in_tag = True
+                        else:
+                            new_blocks.append(
+                                self.markdown.htmlStash.store(block.strip()))
+
+                        continue
+
+                new_blocks.append(block)
+
+            else:
+                items.append(block)
+
+                right_tag, data_index = self._get_right_tag(left_tag, 0, block)
+
+                if self._equal_tags(left_tag, right_tag):
+                    # if find closing tag
+
+                    if data_index < len(block):
+                        # we have more text after right_tag
+                        items[-1] = block[:data_index]
+                        text.insert(0, block[data_index:])
+
+                    in_tag = False
+                    if self.markdown_in_raw and 'markdown' in attrs.keys():
+                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
+                                       '', items[0][:left_index])
+                        items[0] = items[0][left_index:]
+                        end = items[-1][-len(right_tag)-2:]
+                        items[-1] = items[-1][:-len(right_tag)-2]
+                        new_blocks.append(
+                            self.markdown.htmlStash.store(start))
+                        new_blocks.extend(items)
+                        new_blocks.append(
+                            self.markdown.htmlStash.store(end))
+                    else:
+                        new_blocks.append(
+                            self.markdown.htmlStash.store('\n\n'.join(items)))
+                    items = []
+
+        if items:
+            if self.markdown_in_raw and 'markdown' in attrs.keys():
+                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
+                               '', items[0][:left_index])
+                items[0] = items[0][left_index:]
+                end = items[-1][-len(right_tag)-2:]
+                items[-1] = items[-1][:-len(right_tag)-2]
+                new_blocks.append(
+                    self.markdown.htmlStash.store(start))
+                new_blocks.extend(items)
+                if end.strip():
+                    new_blocks.append(
+                        self.markdown.htmlStash.store(end))
+            else:
+                new_blocks.append(
+                    self.markdown.htmlStash.store('\n\n'.join(items)))
+            #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
+            new_blocks.append('\n')
+
+        new_text = "\n\n".join(new_blocks)
+        return new_text.split("\n")
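
The net effect is that block-level raw HTML is lifted out of the source and replaced with opaque htmlStash placeholders; the postprocessors (third_party/markdown/postprocessors.py in this same change) substitute the stored HTML back in after block parsing. A rough usage sketch, assuming the vendored package is importable as markdown (illustrative only, not part of the patch):

    # Illustrative sketch, not part of the patch.
    import markdown
    md = markdown.Markdown()
    pre = HtmlBlockPreprocessor(md)
    out = pre.run(['<div class="note">', 'raw <b>html</b>', '</div>', '',
                   'Plain *markdown*.'])
    # out now contains a placeholder line where the <div> block was;
    # the original HTML sits in md.htmlStash until postprocessing.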
+
+
+class ReferencePreprocessor(Preprocessor):
+    """ Remove reference definitions from text and store for later use. """
+
+    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
+    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
+    TITLE_RE = re.compile(r'^%s$' % TITLE)
+
+    def run (self, lines):
+        new_text = [];
+        while lines:
+            line = lines.pop(0)
+            m = self.RE.match(line)
+            if m:
+                id = m.group(1).strip().lower()
+                link = m.group(2).lstrip('<').rstrip('>')
+                t = m.group(5) or m.group(6) or m.group(7)
+                if not t:
+                    # Check next line for title
+                    tm = self.TITLE_RE.match(lines[0])
+                    if tm:
+                        lines.pop(0)
+                        t = tm.group(2) or tm.group(3) or tm.group(4)
+                self.markdown.references[id] = (link, t)
+            else:
+                new_text.append(line)
+
+        return new_text #+ "\n"
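
For context, the definitions this strips take the form `[id]: url "optional title"`, with the title optionally on the following line. A small sketch of the observable effect, again assuming the vendored package is importable as markdown (illustrative only, not part of the patch):

    # Illustrative sketch, not part of the patch.
    import markdown
    md = markdown.Markdown()
    ref = ReferencePreprocessor(md)
    remaining = ref.run(['[docs]: http://example.com/ "Project docs"',
                         'See [the docs][docs].'])
    # remaining == ['See [the docs][docs].']
    # md.references['docs'] == ('http://example.com/', 'Project docs')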