Index: trunk/src/third_party/markdown/preprocessors.py |
=================================================================== |
--- trunk/src/third_party/markdown/preprocessors.py (revision 244009) |
+++ trunk/src/third_party/markdown/preprocessors.py (working copy) |
@@ -1,298 +0,0 @@ |
-""" |
-PRE-PROCESSORS |
-============================================================================= |
- |
-Preprocessors work on source text before we start doing anything too |
-complicated. |
-""" |
- |
-from __future__ import absolute_import |
-from __future__ import unicode_literals |
-from . import util |
-from . import odict |
-import re |
- |
- |
def build_preprocessors(md_instance, **kwargs):
    """Assemble the ordered registry of default Markdown preprocessors.

    Whitespace normalisation is always registered first and reference
    handling last.  The raw-HTML block preprocessor is registered between
    them unless ``md_instance.safeMode`` equals ``'escape'``.

    ``kwargs`` is accepted but not used here.
    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)
    html_blocks_enabled = md_instance.safeMode != 'escape'
    if html_blocks_enabled:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)
    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
- |
- |
class Preprocessor(util.Processor):
    """
    Base class for preprocessors, which run after the text has been broken
    into lines.

    A preprocessor exposes a "run" method.  It receives the document as a
    list of lines, changes it as needed, and hands back either that same
    list or a new one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Subclasses should override this method.  `lines` is the document as
        a list of strings split on newlines; the return value is the
        (possibly modified) list of lines.

        This base implementation does nothing and returns None.

        """
        return None
- |
- |
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """Return `lines` re-split after whitespace normalisation.

        The lines are joined with newlines, then in order: every occurrence
        of ``util.STX`` and ``util.ETX`` is removed, ``\\r\\n`` and bare
        ``\\r`` become ``\\n``, two newlines are appended, tabs are expanded
        to ``self.markdown.tab_length`` columns, and lines made up only of
        spaces (when preceded by a newline) are emptied.
        """
        text = '\n'.join(lines)
        # NOTE(review): STX/ETX are presumably reserved placeholder
        # delimiters used elsewhere in the package -- confirm in util.
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"
        text = text.expandtabs(self.markdown.tab_length)
        # The look-behind requires a preceding newline, so a spaces-only
        # first line of the document is left untouched.
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')
- |
- |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    The document is split on blank lines.  A block that starts with ``<``
    and is block-level HTML (per ``util.isBlockLevel``), a ``<!`` comment,
    or starts with ``<?``, ``<@`` or ``<%`` is handed to
    ``self.markdown.htmlStash.store`` and replaced in the text by whatever
    that call returns.  An element that is not closed within its own block
    is accumulated across following blocks until its closing tag is found
    (or the document ends).
    """

    # Closing-tag templates tried, in this order, by ``_get_right_tag``.
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose regex for one attribute.  It must end with a newline so the
    # trailing ``# attr`` comment terminates before the pattern is embedded
    # inside ``left_tag_pattern``.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, an element whose opening tag carries a ``markdown``
    # attribute has only its opening and closing tags stashed; the content
    # between them stays in the text.
    markdown_in_raw = False
    # Matches the ``markdown`` attribute (with optional value) so it can be
    # removed from the opening tag before that tag is stashed.
    _markdown_attr_re = re.compile(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?')

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of `block`.

        Returns a tuple ``(tag, length, attrs)``: the tag name, the number
        of characters the opening tag occupies, and a dict of its
        attributes (value-less or empty-valued attributes map to ``""``).

        If the opening-tag regex does not match, the tag is taken as the
        lower-cased text between the first character and the first ``>``,
        and no attributes are reported.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                # Exactly one of the three alternatives matched; a missing
                # or empty value is recorded as "".
                if ma.group('attr'):
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    attrs[ma.group('attr1').strip()] = ma.group('value1') or ""
                elif ma.group('attr2'):
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the `rtag` that balances an already-opened `ltag`.

        Searching starts at `start_index`.  Each further `ltag` met before
        the next `rtag` is treated as a nested element and skipped together
        with its own closing `rtag`.

        Returns the index just past the balancing `rtag`, or -1 when none
        is found.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed - ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for `left_tag` in `block`.

        Each template in ``right_tag_patterns`` is tried from `left_index`
        onwards.  Returns ``(right_tag, index)`` where `right_tag` is the
        closing tag without its leading ``<`` / trailing ``>`` and `index`
        is the position just past it.

        If no template is found, the tail of the right-stripped block is
        returned as the tag together with ``len(block)``.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            # i > 2 also rejects the -1 "not found" result.
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        # NOTE(review): with left_index == 0 the slice is [0:-1], i.e. the
        # whole block minus its last character -- confirm this is intended.
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True when `right_tag` closes `left_tag`.

        Tags starting with ``?``, ``@`` or ``%`` always count as closed.
        Otherwise `right_tag` must be ``"/" + left_tag``, or both tags must
        be the comment marker ``"--"``.  Empty tags are handled without
        raising.
        """
        # Slice instead of index so an empty tag cannot raise IndexError.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if right_tag == "/" + left_tag:
            return True
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (``hr``)."""
        return tag in ('hr', 'hr/')

    def run(self, lines):
        """Stash raw HTML blocks and return the remaining text as lines.

        `lines` is the document as a list of strings.  The result is a new
        list of lines in which each recognised HTML block has been replaced
        by the value returned from ``self.markdown.htmlStash.store``.
        """
        # `text` is the queue of blank-line-separated blocks still to be
        # processed; split-off remainders are pushed back on its front.
        text = "\n".join(lines).split("\n\n")
        new_blocks = []
        items = []          # blocks of a not-yet-closed element
        left_tag = ''
        right_tag = ''
        in_tag = False      # True while collecting a multi-block element

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            # A block may carry up to two leading newlines.
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1] == "!":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) \
                            and (util.isBlockLevel(left_tag)
                                 or left_tag == '--'):
                        # Text follows the closing tag: queue it as its own
                        # block and keep only the element here.
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag)
                            or block[1] in ["!", "?", "@", "%"]):
                        # Not block-level HTML: leave it in the text.
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        # Complete element contained in this one block.
                        if self.markdown_in_raw and 'markdown' in attrs:
                            start = self._markdown_attr_re.sub(
                                '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete

                        # Parenthesised exactly as Python already groups
                        # the original expression ("and" binds tighter than
                        # "or"): a block-level tag always starts collection.
                        # NOTE(review): the intent may have been
                        # "(block-level or comment) and not closed" --
                        # behaviour is deliberately left unchanged here.
                        if util.isBlockLevel(left_tag) \
                                or (left_tag == "--"
                                    and not block.rstrip().endswith(">")):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))

                        continue

                new_blocks.append(block)

            else:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    if self.markdown_in_raw and 'markdown' in attrs:
                        start = self._markdown_attr_re.sub(
                            '', items[0][:left_index])
                        items[0] = items[0][left_index:]
                        end = items[-1][-len(right_tag)-2:]
                        items[-1] = items[-1][:-len(right_tag)-2]
                        new_blocks.append(
                            self.markdown.htmlStash.store(start))
                        new_blocks.extend(items)
                        new_blocks.append(
                            self.markdown.htmlStash.store(end))
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        if items:
            # The document ended while an element was still open.
            if self.markdown_in_raw and 'markdown' in attrs:
                start = self._markdown_attr_re.sub(
                    '', items[0][:left_index])
                items[0] = items[0][left_index:]
                end = items[-1][-len(right_tag)-2:]
                items[-1] = items[-1][:-len(right_tag)-2]
                new_blocks.append(
                    self.markdown.htmlStash.store(start))
                new_blocks.extend(items)
                if end.strip():
                    new_blocks.append(
                        self.markdown.htmlStash.store(end))
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
- |
- |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "double quoted", 'single quoted' or (parenthesised).
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # Groups: 1 = reference id, 2 = link, 5/6/7 = title text (one per
    # quoting style, shifted by the wrapping groups around TITLE).
    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    # A line holding nothing but a title; groups 2/3/4 = title text.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """Extract reference definitions and return the remaining lines.

        Every line matching ``RE`` is dropped from the output and recorded
        in ``self.markdown.references`` as ``id -> (link, title)``, where
        the id is stripped and lower-cased and the link has surrounding
        ``<`` / ``>`` removed.  When the definition line has no title, the
        following line is consumed as the title if it matches
        ``TITLE_RE``.

        The input list is read by index and left unmodified; a new list is
        returned.
        """
        new_text = []
        total = len(lines)
        pos = 0
        while pos < total:
            line = lines[pos]
            pos += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title -- but only if there is a next
            # line; a definition on the very last line has none.
            if not title and pos < total:
                tm = self.TITLE_RE.match(lines[pos])
                if tm:
                    pos += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text