Index: third_party/Python-Markdown/markdown/blockprocessors.py |
diff --git a/third_party/Python-Markdown/markdown/blockprocessors.py b/third_party/Python-Markdown/markdown/blockprocessors.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..29db022cee111b062818853bebaf99d6ffa1dcba |
--- /dev/null |
+++ b/third_party/Python-Markdown/markdown/blockprocessors.py |
@@ -0,0 +1,563 @@ |
+""" |
+CORE MARKDOWN BLOCKPARSER |
+=========================================================================== |
+ |
+This parser handles basic parsing of Markdown blocks. It doesn't concern |
+itself with inline elements such as **bold** or *italics*, but rather just |
+catches blocks, lists, quotes, etc. |
+ |
+The BlockParser is made up of a bunch of BlockProcessors, each handling a |
+different type of block. Extensions may add/replace/remove BlockProcessors |
+as they need to alter how markdown blocks are parsed. |
+""" |
+ |
+from __future__ import absolute_import |
+from __future__ import division |
+from __future__ import unicode_literals |
+import logging |
+import re |
+from . import util |
+from .blockparser import BlockParser |
+ |
+logger = logging.getLogger('MARKDOWN') |
+ |
+ |
+def build_block_parser(md_instance, **kwargs): |
+    """ Build the default block parser; ``kwargs`` is accepted but unused. """ |
+    parser = BlockParser(md_instance) |
+    parser.blockprocessors['empty'] = EmptyBlockProcessor(parser) |
+    parser.blockprocessors['indent'] = ListIndentProcessor(parser) |
+    parser.blockprocessors['code'] = CodeBlockProcessor(parser) |
+    parser.blockprocessors['hashheader'] = HashHeaderProcessor(parser) |
+    parser.blockprocessors['setextheader'] = SetextHeaderProcessor(parser) |
+    parser.blockprocessors['hr'] = HRProcessor(parser) |
+    parser.blockprocessors['olist'] = OListProcessor(parser) |
+    parser.blockprocessors['ulist'] = UListProcessor(parser) |
+    parser.blockprocessors['quote'] = BlockQuoteProcessor(parser) |
+    parser.blockprocessors['paragraph'] = ParagraphProcessor(parser) |
+    return parser |
+ |
+ |
+class BlockProcessor: |
+    """ Base class for block processors. |
+ |
+    Each subclass will provide the methods below to work with the source and |
+    tree. Each processor will need to define its own ``test`` and ``run`` |
+    methods. The ``test`` method should return True or False, to indicate |
+    whether the current block should be processed by this processor. If the |
+    test passes, the parser will call the processor's ``run`` method. |
+ |
+    """ |
+ |
+    def __init__(self, parser): |
+        self.parser = parser |
+        self.tab_length = parser.markdown.tab_length |
+ |
+    def lastChild(self, parent): |
+        """ Return the last child of an etree element (``None`` if empty). """ |
+        if len(parent): |
+            return parent[-1] |
+        else: |
+            return None |
+ |
+    def detab(self, text): |
+        """ Detab the leading indented lines; return (detabbed, remainder). """ |
+        newtext = [] |
+        lines = text.split('\n') |
+        for line in lines: |
+            if line.startswith(' '*self.tab_length): |
+                newtext.append(line[self.tab_length:]) |
+            elif not line.strip(): |
+                newtext.append('')  # blank lines stay inside the run |
+            else: |
+                break  # first unindented, non-blank line ends the run |
+        return '\n'.join(newtext), '\n'.join(lines[len(newtext):]) |
+ |
+    def looseDetab(self, text, level=1): |
+        """ Remove ``level`` tabs from lines that have them; keep the rest. """ |
+        lines = text.split('\n') |
+        for i in range(len(lines)): |
+            if lines[i].startswith(' '*self.tab_length*level): |
+                lines[i] = lines[i][self.tab_length*level:] |
+        return '\n'.join(lines) |
+ |
+    def test(self, parent, block): |
+        """ Test for block type. Must be overridden by subclasses. |
+ |
+        As the parser loops through processors, it will call the ``test`` |
+        method on each to determine if the given block of text is of that |
+        type. This method must return a boolean ``True`` or ``False``. The |
+        actual method of testing is left to the needs of that particular |
+        block type. It could be as simple as ``block.startswith(some_string)`` |
+        or a complex regular expression. As the block type may be different |
+        depending on the parent of the block (i.e. inside a list), the parent |
+        etree element is also provided and may be used as part of the test. |
+ |
+        Keywords: |
+ |
+        * ``parent``: An etree element which will be the parent of the block. |
+        * ``block``: A block of text from the source which has been split at |
+            blank lines. |
+        """ |
+        pass  # pragma: no cover |
+ |
+    def run(self, parent, blocks): |
+        """ Run processor. Must be overridden by subclasses. |
+ |
+        When the parser determines the appropriate type of a block, the parser |
+        will call the corresponding processor's ``run`` method. This method |
+        should parse the individual lines of the block and append them to |
+        the etree. |
+ |
+        Note that both the ``parent`` and ``blocks`` keywords are pointers |
+        to instances of the objects which should be edited in place. Each |
+        processor must make changes to the existing objects as there is no |
+        mechanism to return new/different objects to replace them. |
+ |
+        This means that this method should be adding SubElements or adding text |
+        to the parent, and should remove (``pop``) or add (``insert``) items to |
+        the list of blocks. |
+ |
+        Keywords: |
+ |
+        * ``parent``: An etree element which is the parent of the current block. |
+        * ``blocks``: A list of all remaining blocks of the document. |
+        """ |
+        pass  # pragma: no cover |
+ |
+ |
+class ListIndentProcessor(BlockProcessor): |
+    """ Process children of list items. |
+ |
+    Example: |
+        * a list item |
+            process this part |
+ |
+            or this part |
+ |
+    """ |
+ |
+    ITEM_TYPES = ['li'] |
+    LIST_TYPES = ['ul', 'ol'] |
+ |
+    def __init__(self, *args): |
+        BlockProcessor.__init__(self, *args) |
+        self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length) |
+ |
+    def test(self, parent, block): |
+        return block.startswith(' '*self.tab_length) and \ |
+            not self.parser.state.isstate('detabbed') and \ |
+            (parent.tag in self.ITEM_TYPES or |
+                (len(parent) and parent[-1] is not None and |
+                    (parent[-1].tag in self.LIST_TYPES))) |
+ |
+    def run(self, parent, blocks): |
+        block = blocks.pop(0) |
+        level, sibling = self.get_level(parent, block) |
+        block = self.looseDetab(block, level) |
+ |
+        self.parser.state.set('detabbed')  # ``test`` rejects this state |
+        if parent.tag in self.ITEM_TYPES: |
+            # It's possible that this parent has a 'ul' or 'ol' child list |
+            # with a member.  If that is the case, then that should be the |
+            # parent.  This is intended to catch the edge case of an indented |
+            # list whose first member was parsed previous to this point |
+            # see OListProcessor |
+            if len(parent) and parent[-1].tag in self.LIST_TYPES: |
+                self.parser.parseBlocks(parent[-1], [block]) |
+            else: |
+                # The parent is already a li. Just parse the child block. |
+                self.parser.parseBlocks(parent, [block]) |
+        elif sibling.tag in self.ITEM_TYPES: |
+            # The sibling is a li. Use it as parent. |
+            self.parser.parseBlocks(sibling, [block]) |
+        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES: |
+            # The sibling (a list, ``ol`` or ``ul``) has a li as last child. |
+            # Assume the last child li is the parent of this block. |
+            if sibling[-1].text: |
+                # If the parent li has text, that text needs to be moved to a p |
+                # The p must be 'inserted' at beginning of list in the event |
+                # that other children already exist i.e.; a nested sublist. |
+                p = util.etree.Element('p') |
+                p.text = sibling[-1].text |
+                sibling[-1].text = '' |
+                sibling[-1].insert(0, p) |
+            self.parser.parseChunk(sibling[-1], block) |
+        else: |
+            self.create_item(sibling, block) |
+        self.parser.state.reset() |
+ |
+    def create_item(self, parent, block): |
+        """ Create a new li and parse the block with it as the parent. """ |
+        li = util.etree.SubElement(parent, 'li') |
+        self.parser.parseBlocks(li, [block]) |
+ |
+    def get_level(self, parent, block): |
+        """ Return ``(level, parent)`` matching the block's indent level. """ |
+        # Get indent level |
+        m = self.INDENT_RE.match(block) |
+        if m: |
+            indent_level = len(m.group(1))/self.tab_length  # true division |
+        else: |
+            indent_level = 0 |
+        if self.parser.state.isstate('list'): |
+            # We're in a tightlist - so we already are at correct parent. |
+            level = 1 |
+        else: |
+            # We're in a looselist - so we need to find parent. |
+            level = 0 |
+        # Step through children of tree to find matching indent level. |
+        while indent_level > level: |
+            child = self.lastChild(parent) |
+            if (child is not None and |
+               (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)): |
+                if child.tag in self.LIST_TYPES: |
+                    level += 1  # only lists (not li) add a level |
+                parent = child |
+            else: |
+                # No more child levels. If we're short of indent_level, |
+                # we have a code block. So we stop here. |
+                break |
+        return level, parent |
+ |
+ |
+class CodeBlockProcessor(BlockProcessor): |
+    """ Process indented code blocks into ``pre``/``code`` elements. """ |
+ |
+    def test(self, parent, block): |
+        return block.startswith(' '*self.tab_length) |
+ |
+    def run(self, parent, blocks): |
+        sibling = self.lastChild(parent) |
+        block = blocks.pop(0) |
+        theRest = '' |
+        if (sibling is not None and sibling.tag == "pre" and |
+           len(sibling) and sibling[0].tag == "code"): |
+            # The previous block was a code block. As blank lines do not start |
+            # new code blocks, append this block to the previous, adding back |
+            # linebreaks removed from the split into a list. |
+            code = sibling[0] |
+            block, theRest = self.detab(block) |
+            code.text = util.AtomicString( |
+                '%s\n%s\n' % (code.text, block.rstrip()) |
+            ) |
+        else: |
+            # This is a new codeblock. Create the elements and insert text. |
+            pre = util.etree.SubElement(parent, 'pre') |
+            code = util.etree.SubElement(pre, 'code') |
+            block, theRest = self.detab(block) |
+            code.text = util.AtomicString('%s\n' % block.rstrip()) |
+        if theRest: |
+            # This block contained unindented line(s) after the first indented |
+            # line. Insert these lines as the first block of the master blocks |
+            # list for future processing. |
+            blocks.insert(0, theRest) |
+ |
+ |
+class BlockQuoteProcessor(BlockProcessor): |
+ |
+    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')  # group(2): quoted text |
+ |
+    def test(self, parent, block): |
+        return bool(self.RE.search(block)) |
+ |
+    def run(self, parent, blocks): |
+        block = blocks.pop(0) |
+        m = self.RE.search(block) |
+        if m: |
+            before = block[:m.start()]  # Lines before blockquote |
+            # Pass lines before blockquote in recursively for parsing first. |
+            self.parser.parseBlocks(parent, [before]) |
+            # Remove ``> `` from beginning of each line. |
+            block = '\n'.join( |
+                [self.clean(line) for line in block[m.start():].split('\n')] |
+            ) |
+        sibling = self.lastChild(parent) |
+        if sibling is not None and sibling.tag == "blockquote": |
+            # Previous block was a blockquote so set that as this block's parent |
+            quote = sibling |
+        else: |
+            # This is a new blockquote. Create a new parent element. |
+            quote = util.etree.SubElement(parent, 'blockquote') |
+        # Recursively parse block with blockquote as parent. |
+        # change parser state so blockquotes embedded in lists use p tags |
+        self.parser.state.set('blockquote') |
+        self.parser.parseChunk(quote, block) |
+        self.parser.state.reset() |
+ |
+    def clean(self, line): |
+        """ Remove ``>`` from beginning of a line. """ |
+        m = self.RE.match(line) |
+        if line.strip() == ">": |
+            return "" |
+        elif m: |
+            return m.group(2) |
+        else: |
+            return line |
+ |
+ |
+class OListProcessor(BlockProcessor): |
+    """ Process ordered list blocks. """ |
+ |
+    TAG = 'ol' |
+    # Detect an item (``1. item``). ``group(1)`` contains contents of item. |
+    RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)') |
+    # Detect items on secondary lines. they can be of either list type. |
+    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)') |
+    # Detect indented (nested) items of either type |
+    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*') |
+    # The integer (python string) with which the lists starts (default=1) |
+    # E.g. if the list is initialized as: |
+    #   3. Item |
+    # The ol tag will get a start="3" attribute |
+    STARTSWITH = '1' |
+    # List of allowed sibling tags. |
+    SIBLING_TAGS = ['ol', 'ul'] |
+ |
+    def test(self, parent, block): |
+        return bool(self.RE.match(block)) |
+ |
+    def run(self, parent, blocks): |
+        # Check for multiple items in one block. |
+        items = self.get_items(blocks.pop(0)) |
+        sibling = self.lastChild(parent) |
+ |
+        if sibling is not None and sibling.tag in self.SIBLING_TAGS: |
+            # Previous block was a list, so set that as parent |
+            lst = sibling |
+            # make sure previous item is in a p- if the item has text, |
+            # then it isn't in a p |
+            if lst[-1].text: |
+                # since it's possible there are other children for this |
+                # sibling, we can't just SubElement the p, we need to |
+                # insert it as the first item. |
+                p = util.etree.Element('p') |
+                p.text = lst[-1].text |
+                lst[-1].text = '' |
+                lst[-1].insert(0, p) |
+            # if the last item has a tail, then the tail needs to be put in a p |
+            # likely only when a header is not followed by a blank line |
+            lch = self.lastChild(lst[-1]) |
+            if lch is not None and lch.tail: |
+                p = util.etree.SubElement(lst[-1], 'p') |
+                p.text = lch.tail.lstrip() |
+                lch.tail = '' |
+ |
+            # parse first block differently as it gets wrapped in a p. |
+            li = util.etree.SubElement(lst, 'li') |
+            self.parser.state.set('looselist') |
+            firstitem = items.pop(0) |
+            self.parser.parseBlocks(li, [firstitem]) |
+            self.parser.state.reset() |
+        elif parent.tag in ['ol', 'ul']: |
+            # this catches the edge case of a multi-item indented list whose |
+            # first item is in a blank parent-list item: |
+            # * * subitem1 |
+            #     * subitem2 |
+            # see also ListIndentProcessor |
+            lst = parent |
+        else: |
+            # This is a new list so create parent with appropriate tag. |
+            lst = util.etree.SubElement(parent, self.TAG) |
+            # Check if a custom start integer is set |
+            if not self.parser.markdown.lazy_ol and self.STARTSWITH != '1': |
+                lst.attrib['start'] = self.STARTSWITH |
+ |
+        self.parser.state.set('list') |
+        # Loop through items in block, recursively parsing each with the |
+        # appropriate parent. |
+        for item in items: |
+            if item.startswith(' '*self.tab_length): |
+                # Item is indented. Parse with last item as parent |
+                self.parser.parseBlocks(lst[-1], [item]) |
+            else: |
+                # New item. Create li and parse with it as parent |
+                li = util.etree.SubElement(lst, 'li') |
+                self.parser.parseBlocks(li, [item]) |
+        self.parser.state.reset() |
+ |
+    def get_items(self, block): |
+        """ Break a block into list items; record an ol's start number. """ |
+        items = [] |
+        for line in block.split('\n'): |
+            m = self.CHILD_RE.match(line) |
+            if m: |
+                # This is a new list item |
+                # Check first item for the start index |
+                if not items and self.TAG == 'ol': |
+                    # Detect the integer value of first list item |
+                    INTEGER_RE = re.compile(r'(\d+)') |
+                    self.STARTSWITH = INTEGER_RE.match(m.group(1)).group() |
+                # Append to the list |
+                items.append(m.group(3)) |
+            elif self.INDENT_RE.match(line): |
+                # This is an indented (possibly nested) item. |
+                if items[-1].startswith(' '*self.tab_length): |
+                    # Previous item was indented. Append to that item. |
+                    items[-1] = '%s\n%s' % (items[-1], line) |
+                else: |
+                    items.append(line) |
+            else: |
+                # This is another line of previous item. Append to that item. |
+                items[-1] = '%s\n%s' % (items[-1], line) |
+        return items |
+ |
+ |
+class UListProcessor(OListProcessor): |
+    """ Process unordered list blocks (items marked ``*``, ``+`` or ``-``). """ |
+ |
+    TAG = 'ul' |
+    RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)') |
+ |
+ |
+class HashHeaderProcessor(BlockProcessor): |
+    """ Process Hash Headers (``#`` to ``######`` become h1 to h6). """ |
+ |
+    # Detect a header at start of any line in block |
+    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)') |
+ |
+    def test(self, parent, block): |
+        return bool(self.RE.search(block)) |
+ |
+    def run(self, parent, blocks): |
+        block = blocks.pop(0) |
+        m = self.RE.search(block) |
+        if m: |
+            before = block[:m.start()]  # All lines before header |
+            after = block[m.end():]     # All lines after header |
+            if before: |
+                # As the header was not the first line of the block and the |
+                # lines before the header must be parsed first, |
+                # recursively parse these lines as a block. |
+                self.parser.parseBlocks(parent, [before]) |
+            # Create header using named groups from RE |
+            h = util.etree.SubElement(parent, 'h%d' % len(m.group('level'))) |
+            h.text = m.group('header').strip() |
+            if after: |
+                # Insert remaining lines as first block for future parsing. |
+                blocks.insert(0, after) |
+        else:  # pragma: no cover |
+            # This should never happen, but just in case... |
+            logger.warning("We've got a problem header: %r", block) |
+ |
+ |
+class SetextHeaderProcessor(BlockProcessor): |
+    """ Process Setext-style Headers (text underlined with ``=`` or ``-``). """ |
+ |
+    # Detect Setext-style header. Must be first 2 lines of block. |
+    RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE) |
+ |
+    def test(self, parent, block): |
+        return bool(self.RE.match(block)) |
+ |
+    def run(self, parent, blocks): |
+        lines = blocks.pop(0).split('\n')  # [title, underline, rest...] |
+        # Determine level. ``=`` is 1 and ``-`` is 2. |
+        if lines[1].startswith('='): |
+            level = 1 |
+        else: |
+            level = 2 |
+        h = util.etree.SubElement(parent, 'h%d' % level) |
+        h.text = lines[0].strip() |
+        if len(lines) > 2: |
+            # Block contains additional lines. Add to master blocks for later. |
+            blocks.insert(0, '\n'.join(lines[2:])) |
+ |
+ |
+class HRProcessor(BlockProcessor): |
+    """ Process Horizontal Rules (three or more ``-``, ``_`` or ``*``). """ |
+ |
+    RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*' |
+    # Detect hr on any line of a block. |
+    SEARCH_RE = re.compile(RE, re.MULTILINE) |
+ |
+    def test(self, parent, block): |
+        m = self.SEARCH_RE.search(block) |
+        # No atomic grouping in python so we simulate it here for performance. |
+        # The regex only matches what would be in the atomic group - the HR. |
+        # Then check if we are at end of block or if next char is a newline. |
+        if m and (m.end() == len(block) or block[m.end()] == '\n'): |
+            # Save match object on class instance so we can use it later. |
+            self.match = m |
+            return True |
+        return False |
+ |
+    def run(self, parent, blocks): |
+        block = blocks.pop(0) |
+        # Lines before the hr; uses the match saved by ``test``. |
+        prelines = block[:self.match.start()].rstrip('\n') |
+        if prelines: |
+            # Recursively parse lines before hr so they get parsed first. |
+            self.parser.parseBlocks(parent, [prelines]) |
+        # create hr |
+        util.etree.SubElement(parent, 'hr') |
+        # check for lines in block after hr. |
+        postlines = block[self.match.end():].lstrip('\n') |
+        if postlines: |
+            # Add lines after hr to master blocks for later parsing. |
+            blocks.insert(0, postlines) |
+ |
+ |
+class EmptyBlockProcessor(BlockProcessor): |
+    """ Process blocks that are empty or start with an empty line. """ |
+ |
+    def test(self, parent, block): |
+        return not block or block.startswith('\n') |
+ |
+    def run(self, parent, blocks): |
+        block = blocks.pop(0) |
+        filler = '\n\n'  # used when the block is entirely empty |
+        if block: |
+            # Starts with empty line |
+            # Only replace a single line. |
+            filler = '\n' |
+            # Save the rest for later. |
+            theRest = block[1:]  # drop only the leading newline |
+            if theRest: |
+                # Add remaining lines to master blocks for later. |
+                blocks.insert(0, theRest) |
+        sibling = self.lastChild(parent) |
+        if (sibling is not None and sibling.tag == 'pre' and |
+           len(sibling) and sibling[0].tag == 'code'): |
+            # Last block is a codeblock. Append to preserve whitespace. |
+            sibling[0].text = util.AtomicString( |
+                '%s%s' % (sibling[0].text, filler) |
+            ) |
+ |
+ |
+class ParagraphProcessor(BlockProcessor): |
+    """ Process Paragraph blocks; the catch-all whose ``test`` always passes. """ |
+ |
+    def test(self, parent, block): |
+        return True |
+ |
+    def run(self, parent, blocks): |
+        block = blocks.pop(0) |
+        if block.strip(): |
+            # Not a blank block. Add to parent, otherwise throw it away. |
+            if self.parser.state.isstate('list'): |
+                # The parent is a tight-list. |
+                # |
+                # Check for any children. This will likely only happen in a |
+                # tight-list when a header isn't followed by a blank line. |
+                # For example: |
+                # |
+                #     * # Header |
+                #     Line 2 of list item - not part of header. |
+                sibling = self.lastChild(parent) |
+                if sibling is not None: |
+                    # Insert after sibling. |
+                    if sibling.tail: |
+                        sibling.tail = '%s\n%s' % (sibling.tail, block) |
+                    else: |
+                        sibling.tail = '\n%s' % block |
+                else: |
+                    # Append to parent.text |
+                    if parent.text: |
+                        parent.text = '%s\n%s' % (parent.text, block) |
+                    else: |
+                        parent.text = block.lstrip() |
+            else: |
+                # Create a regular paragraph |
+                p = util.etree.SubElement(parent, 'p') |
+                p.text = block.lstrip() |