Index: third_party/Python-Markdown/markdown/preprocessors.py |
diff --git a/third_party/Python-Markdown/markdown/preprocessors.py b/third_party/Python-Markdown/markdown/preprocessors.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..7fd38d331fb5685a4c06f23e646c9d4d40e69b8b |
--- /dev/null |
+++ b/third_party/Python-Markdown/markdown/preprocessors.py |
@@ -0,0 +1,345 @@ |
+""" |
+PRE-PROCESSORS |
+============================================================================= |
+ |
+Preprocessors work on source text before we start doing anything too |
+complicated. |
+""" |
+ |
+from __future__ import absolute_import |
+from __future__ import unicode_literals |
+from . import util |
+from . import odict |
+import re |
+ |
+ |
+def build_preprocessors(md_instance, **kwargs): |
+    """ Return an ordered mapping of the default Markdown preprocessors. """ |
+    stages = odict.OrderedDict() |
+    stages['normalize_whitespace'] = NormalizeWhitespace(md_instance) |
+    if md_instance.safeMode != 'escape':  # no html stashing in escape mode |
+        stages["html_block"] = HtmlBlockPreprocessor(md_instance) |
+    stages["reference"] = ReferencePreprocessor(md_instance) |
+    return stages |
+ |
+ |
+class Preprocessor(util.Processor): |
+    """ |
+    Preprocessors are run after the text is broken into lines. |
+ |
+    Each preprocessor implements a `run` method that receives the document |
+    as a list of lines, modifies it as necessary and returns either that |
+    same list object or a new list. |
+ |
+    Preprocessors must extend `markdown.preprocessors.Preprocessor`. |
+ |
+    """ |
+    def run(self, lines): |
+        """ |
+        Subclasses should override this method. It is given the document as |
+        a list of strings (one per line) and should return the (possibly |
+        modified) list of lines; this base implementation returns None. |
+ |
+        """ |
+        pass  # pragma: no cover |
+ |
+ |
+class NormalizeWhitespace(Preprocessor): |
+    """ Normalize whitespace for consistent parsing. """ |
+ |
+    def run(self, lines): |
+        text = '\n'.join(lines) |
+        for marker in (util.STX, util.ETX):  # drop any util.STX / util.ETX |
+            text = text.replace(marker, "") |
+        text = text.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" |
+        text = text.expandtabs(self.markdown.tab_length) |
+        return re.sub(r'(?<=\n) +\n', '\n', text).split('\n') |
+ |
+ |
+class HtmlBlockPreprocessor(Preprocessor): |
+    """Remove html blocks from the text and store them for later retrieval.""" |
+    # Closing-tag templates, tried in order by _get_right_tag(); %s is the tag. |
+    right_tag_patterns = ["</%s>", "%s>"] |
+    attrs_pattern = r""" |
+ \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value" |
+ | # OR |
+ \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value |
+ | # OR |
+ \s+(?P<attr2>[^>"'/= ]+) # attr |
+ """ |
+    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \ |
+        attrs_pattern |
+    attrs_re = re.compile(attrs_pattern, re.VERBOSE)  # one attribute per match |
+    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)  # tag at block start |
+    markdown_in_raw = False  # gates the 'markdown' attribute handling in run() |
+ |
+    def _get_left_tag(self, block): |
+        """ |
+        Parse the opening tag at the start of `block`. |
+ |
+        Returns a `(tag, length, attrs)` tuple: the tag name, the number of |
+        characters taken up by the opening tag and a dict of its attributes |
+        (an attribute without a value maps to an empty string). |
+        """ |
+        found = self.left_tag_re.match(block) |
+        if not found: |
+            # Not a parsable opening tag: use everything up to the first ">" |
+            # as the lower-cased tag name and report no attributes. |
+            name = block[1:].split(">", 1)[0].lower() |
+            return name, len(name) + 2, {} |
+ |
+        parsed = {} |
+        # The `attrs` group always takes part in a match (possibly empty), so |
+        # it can be scanned without a separate emptiness check. |
+        for hit in self.attrs_re.finditer(found.group('attrs')): |
+            if hit.group('attr'):  # attr="value" |
+                parsed[hit.group('attr').strip()] = hit.group('value') or "" |
+            elif hit.group('attr1'):  # attr=value |
+                parsed[hit.group('attr1').strip()] = hit.group('value1') or "" |
+            elif hit.group('attr2'):  # bare attr |
+                parsed[hit.group('attr2').strip()] = "" |
+        return found.group('tag'), len(found.group(0)), parsed |
+ |
+    def _recursive_tagfind(self, ltag, rtag, start_index, block): |
+        """Return the index just past the `rtag` that balances an `ltag` |
+        opened before `start_index`, skipping nested pairs; -1 if none.""" |
+        cursor = start_index |
+        while True: |
+            close_at = block.find(rtag, cursor) |
+            if close_at == -1: |
+                return -1 |
+            open_at = block.find(ltag, cursor) |
+            if open_at == -1 or open_at > close_at: |
+                # No further opener before this closer: it is the match. |
+                return close_at + len(rtag) |
+            # A nested opener comes first; resume after its own closer. |
+            nested_body = block.find('>', open_at) + 1 |
+            cursor = self._recursive_tagfind(ltag, rtag, nested_body, block) |
+            if cursor == -1:  # malformed html: the nested opener never closes |
+                return -1 |
+ |
+    def _get_right_tag(self, left_tag, left_index, block): |
+        """Return `(right_tag, end_index)` for the tag closing `left_tag`.""" |
+        opener = "<%s" % left_tag |
+        for template in self.right_tag_patterns: |
+            closer = template % left_tag |
+            end = self._recursive_tagfind(opener, closer, left_index, block) |
+            if end > 2: |
+                return closer.lstrip("<").rstrip(">"), end |
+        return block.rstrip()[-left_index:-1].lower(), len(block)  # no closer |
+ |
+    def _equal_tags(self, left_tag, right_tag): |
+        """ |
+        Return True if `right_tag` closes `left_tag`: tags starting with |
+        "?", "@" or "%" (PHP, etc.) always count as closed, "--" (comment) |
+        pairs with "--" and any other tag `x` pairs only with "/x". |
+        """ |
+        if not left_tag:  # nothing to pair; indexing [0] would raise |
+            return False |
+        if left_tag[0] in ('?', '@', '%'):  # handle PHP, etc. |
+            return True |
+        return right_tag == "/" + left_tag or left_tag == right_tag == "--" |
+ |
+    def _is_oneliner(self, tag): |
+        return tag == 'hr' or tag == 'hr/'  # horizontal rule, with or without "/" |
+ |
+    def _stringindex_to_listindex(self, stringindex, items): |
+        """Map offset `stringindex` of ''.join(items) to the index of the item |
+        holding that character, or len(items) when the offset is at or past |
+        the end of the joined text. `items` is left unmodified.""" |
+        if stringindex < 0: |
+            return -1  # a negative offset precedes every item |
+        consumed = 0 |
+        for position, item in enumerate(items): |
+            consumed += len(item) |
+            if consumed > stringindex: |
+                return position |
+        return len(items) |
+ |
+    def _nested_markdown_in_html(self, items): |
+        """Stash the html child elements found in `items`; returns `items`.""" |
+        for i, item in enumerate(items):  # NOTE: items is edited while iterating |
+            if self.left_tag_re.match(item): |
+                left_tag, left_index, attrs = \ |
+                    self._get_left_tag(''.join(items[i:])) |
+                right_tag, data_index = self._get_right_tag( |
+                    left_tag, left_index, ''.join(items[i:])) |
+                right_listindex = \ |
+                    self._stringindex_to_listindex(data_index, items[i:]) + i |
+                if 'markdown' in attrs.keys():  # keep the content, stash the tags |
+                    items[i] = items[i][left_index:]  # remove opening tag |
+                    placeholder = self.markdown.htmlStash.store_tag( |
+                        left_tag, attrs, i + 1, right_listindex + 1) |
+                    items.insert(i, placeholder) |
+                    if len(items) - right_listindex <= 1:  # last nest, no tail |
+                        right_listindex -= 1 |
+                    items[right_listindex] = items[right_listindex][ |
+                        :-len(right_tag) - 2]  # remove closing tag |
+                else:  # raw html |
+                    if len(items) - right_listindex <= 1:  # last element |
+                        right_listindex -= 1 |
+                    offset = 1 if i == right_listindex else 0  # one-item element |
+                    placeholder = self.markdown.htmlStash.store('\n\n'.join( |
+                        items[i:right_listindex + offset])) |
+                    del items[i:right_listindex + offset] |
+                    items.insert(i, placeholder) |
+        return items |
+ |
+    def run(self, lines):  # stash raw html blocks and return the new lines |
+        text = "\n".join(lines) |
+        new_blocks = [] |
+        text = text.rsplit("\n\n") |
+        items = [] |
+        left_tag = '' |
+        right_tag = '' |
+        in_tag = False  # True while inside a still-unclosed html element |
+        # Walk the "\n\n"-separated blocks; an unclosed element piles up in items. |
+        while text: |
+            block = text[0] |
+            if block.startswith("\n"): |
+                block = block[1:] |
+            text = text[1:] |
+ |
+            if block.startswith("\n"): |
+                block = block[1:] |
+ |
+            if not in_tag: |
+                if block.startswith("<") and len(block.strip()) > 1: |
+                    # Work out the opening tag and where its closing tag ends. |
+                    if block[1:4] == "!--": |
+                        # is a comment block |
+                        left_tag, left_index, attrs = "--", 2, {} |
+                    else: |
+                        left_tag, left_index, attrs = self._get_left_tag(block) |
+                    right_tag, data_index = self._get_right_tag(left_tag, |
+                                                                left_index, |
+                                                                block) |
+                    # keep checking conditions below and maybe just append |
+                    # Requeue any text after the closing tag as its own block. |
+                    if data_index < len(block) and (util.isBlockLevel(left_tag) or left_tag == '--'): |
+                        text.insert(0, block[data_index:]) |
+                        block = block[:data_index] |
+                    # Not block-level html (nor <!, <?, <@, <%): pass it through. |
+                    if not (util.isBlockLevel(left_tag) or block[1] in ["!", "?", "@", "%"]): |
+                        new_blocks.append(block) |
+                        continue |
+                    # One-line tags are passed through as well, whitespace-stripped. |
+                    if self._is_oneliner(left_tag): |
+                        new_blocks.append(block.strip()) |
+                        continue |
+                    # Element complete within this block: stash it right away. |
+                    if block.rstrip().endswith(">") \ |
+                            and self._equal_tags(left_tag, right_tag): |
+                        if self.markdown_in_raw and 'markdown' in attrs.keys(): |
+                            block = block[left_index:-len(right_tag) - 2] |
+                            new_blocks.append(self.markdown.htmlStash. |
+                                              store_tag(left_tag, attrs, 0, 2)) |
+                            new_blocks.extend([block]) |
+                        else: |
+                            new_blocks.append( |
+                                self.markdown.htmlStash.store(block.strip())) |
+                        continue |
+                    else: |
+                        # if is block level tag and is not complete |
+                        if (not self._equal_tags(left_tag, right_tag)) and \ |
+                           (util.isBlockLevel(left_tag) or left_tag == "--"): |
+                            items.append(block.strip()) |
+                            in_tag = True |
+                        else: |
+                            new_blocks.append( |
+                                self.markdown.htmlStash.store(block.strip()) |
+                            ) |
+                        continue |
+ |
+                else: |
+                    new_blocks.append(block) |
+ |
+            else: |
+                items.append(block) |
+                # Keep collecting until a block holds the matching closing tag. |
+                right_tag, data_index = self._get_right_tag(left_tag, 0, block) |
+ |
+                if self._equal_tags(left_tag, right_tag): |
+                    # if find closing tag |
+ |
+                    if data_index < len(block): |
+                        # we have more text after right_tag |
+                        items[-1] = block[:data_index] |
+                        text.insert(0, block[data_index:]) |
+ |
+                    in_tag = False |
+                    if self.markdown_in_raw and 'markdown' in attrs.keys(): |
+                        items[0] = items[0][left_index:] |
+                        items[-1] = items[-1][:-len(right_tag) - 2] |
+                        if items[len(items) - 1]:  # not a newline/empty string |
+                            right_index = len(items) + 3 |
+                        else: |
+                            right_index = len(items) + 2 |
+                        new_blocks.append(self.markdown.htmlStash.store_tag( |
+                            left_tag, attrs, 0, right_index)) |
+                        placeholderslen = len(self.markdown.htmlStash.tag_data) |
+                        new_blocks.extend( |
+                            self._nested_markdown_in_html(items)) |
+                        nests = len(self.markdown.htmlStash.tag_data) - \ |
+                            placeholderslen |
+                        self.markdown.htmlStash.tag_data[-1 - nests][ |
+                            'right_index'] += nests - 2  # NOTE(review): confirm |
+                    else: |
+                        new_blocks.append( |
+                            self.markdown.htmlStash.store('\n\n'.join(items))) |
+                    items = [] |
+        # Input ended inside an unclosed element: flush what was collected. |
+        if items: |
+            if self.markdown_in_raw and 'markdown' in attrs.keys(): |
+                items[0] = items[0][left_index:] |
+                items[-1] = items[-1][:-len(right_tag) - 2] |
+                if items[len(items) - 1]:  # not a newline/empty string |
+                    right_index = len(items) + 3 |
+                else: |
+                    right_index = len(items) + 2 |
+                new_blocks.append( |
+                    self.markdown.htmlStash.store_tag( |
+                        left_tag, attrs, 0, right_index)) |
+                placeholderslen = len(self.markdown.htmlStash.tag_data) |
+                new_blocks.extend(self._nested_markdown_in_html(items)) |
+                nests = len(self.markdown.htmlStash.tag_data) - placeholderslen |
+                self.markdown.htmlStash.tag_data[-1 - nests][ |
+                    'right_index'] += nests - 2 |
+            else: |
+                new_blocks.append( |
+                    self.markdown.htmlStash.store('\n\n'.join(items))) |
+            new_blocks.append('\n') |
+ |
+        new_text = "\n\n".join(new_blocks) |
+        return new_text.split("\n") |
+ |
+ |
+class ReferencePreprocessor(Preprocessor): |
+    """ Remove reference definitions from text and store for later use. """ |
+ |
+    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*' |
+    RE = re.compile( |
+        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL |
+    ) |
+    TITLE_RE = re.compile(r'^%s$' % TITLE) |
+ |
+    def run(self, lines): |
+        """Drain `lines`, storing each `[id]: url "title"` definition in |
+        `self.markdown.references` and returning every other line in order.""" |
+        new_text = [] |
+        while lines: |
+            line = lines.pop(0) |
+            m = self.RE.match(line) |
+            if not m: |
+                new_text.append(line) |
+                continue |
+            ref_id = m.group(1).strip().lower() |
+            link = m.group(2).lstrip('<').rstrip('>') |
+            title = m.group(5) or m.group(6) or m.group(7) |
+            if not title and lines:  # title may sit on the next line, if any |
+                tm = self.TITLE_RE.match(lines[0]) |
+                if tm: |
+                    lines.pop(0) |
+                    title = tm.group(2) or tm.group(3) or tm.group(4) |
+            self.markdown.references[ref_id] = (link, title) |
+        return new_text |