| Index: third_party/markdown/blockprocessors.py
|
| diff --git a/third_party/markdown/blockprocessors.py b/third_party/markdown/blockprocessors.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..61977b49a81227bdea69e515c0b69b10d14ebe9f
|
| --- /dev/null
|
| +++ b/third_party/markdown/blockprocessors.py
|
| @@ -0,0 +1,558 @@
|
| +"""
|
| +CORE MARKDOWN BLOCKPARSER
|
| +===========================================================================
|
| +
|
| +This parser handles basic parsing of Markdown blocks. It doesn't concern itself
|
| +with inline elements such as **bold** or *italics*, but rather just catches
|
| +blocks, lists, quotes, etc.
|
| +
|
| +The BlockParser is made up of a bunch of BlockProcessors, each handling a
|
| +different type of block. Extensions may add/replace/remove BlockProcessors
|
| +as they need to alter how markdown blocks are parsed.
|
| +"""
|
| +
|
| +from __future__ import absolute_import
|
| +from __future__ import division
|
| +from __future__ import unicode_literals
|
| +import logging
|
| +import re
|
| +from . import util
|
| +from .blockparser import BlockParser
|
| +
|
| +logger = logging.getLogger('MARKDOWN')
|
| +
|
| +
|
def build_block_parser(md_instance, **kwargs):
    """ Assemble the default block parser used by Markdown.

    A ``BlockParser`` is created for ``md_instance`` and one instance of each
    stock block processor is stored in its ``blockprocessors`` mapping.

    Keywords:

    * ``md_instance``: The Markdown instance handed to ``BlockParser``.
    * ``kwargs``: Accepted but not used by this function.

    Returns the populated ``BlockParser``.
    """
    block_parser = BlockParser(md_instance)
    # Registration order is preserved exactly as the processors are listed
    # here.  NOTE(review): presumably the parser consults processors in this
    # order, which would make the sequence significant -- confirm in
    # ``blockparser`` before reordering.
    default_processors = (
        ('empty', EmptyBlockProcessor),
        ('indent', ListIndentProcessor),
        ('code', CodeBlockProcessor),
        ('hashheader', HashHeaderProcessor),
        ('setextheader', SetextHeaderProcessor),
        ('hr', HRProcessor),
        ('olist', OListProcessor),
        ('ulist', UListProcessor),
        ('quote', BlockQuoteProcessor),
        ('paragraph', ParagraphProcessor),
    )
    for key, processor_class in default_processors:
        block_parser.blockprocessors[key] = processor_class(block_parser)
    return block_parser
|
| +
|
| +
|
class BlockProcessor:
    """ Base class for block processors.

    Each subclass will provide the methods below to work with the source and
    tree. Each processor will need to define its own ``test`` and ``run``
    methods. The ``test`` method should return True or False, to indicate
    whether the current block should be processed by this processor. If the
    test passes, the parser will call the processor's ``run`` method.

    """

    def __init__(self, parser):
        """ Store the owning parser and cache its configured tab length. """
        self.parser = parser
        self.tab_length = parser.markdown.tab_length

    def lastChild(self, parent):
        """ Return the last child of an etree element, or None if it has none. """
        return parent[-1] if len(parent) else None

    def detab(self, text):
        """ Remove a tab from the front of each line of the given text.

        Lines are consumed from the top for as long as they either start with
        ``tab_length`` spaces (the indent is stripped) or are blank (they
        become empty strings). Processing stops at the first line that is
        neither.

        Returns a ``(detabbed, remainder)`` tuple of strings, where
        ``remainder`` holds the untouched lines from the stopping point on.
        """
        indent = ' ' * self.tab_length
        lines = text.split('\n')
        detabbed = []
        for line in lines:
            if line.startswith(indent):
                detabbed.append(line[self.tab_length:])
            elif not line.strip():
                detabbed.append('')
            else:
                break
        return '\n'.join(detabbed), '\n'.join(lines[len(detabbed):])

    def looseDetab(self, text, level=1):
        """ Remove a tab from front of lines but allowing dedented lines.

        Every line that starts with ``tab_length * level`` spaces has that
        many characters stripped; all other lines are kept unchanged, so
        (unlike ``detab``) no line ends the processing early.
        """
        width = self.tab_length * level
        indent = ' ' * width
        return '\n'.join(
            line[width:] if line.startswith(indent) else line
            for line in text.split('\n')
        )

    def test(self, parent, block):
        """ Test for block type. Must be overridden by subclasses.

        As the parser loops through processors, it will call the ``test``
        method on each to determine if the given block of text is of that
        type. This method must return a boolean ``True`` or ``False``. The
        actual method of testing is left to the needs of that particular
        block type. It could be as simple as
        ``block.startswith(some_string)`` or a complex regular expression.
        As the block type may be different depending on the parent of the
        block (i.e. inside a list), the parent etree element is also provided
        and may be used as part of the test.

        Keywords:

        * ``parent``: A etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
          blank lines.
        """
        pass

    def run(self, parent, blocks):
        """ Run processor. Must be overridden by subclasses.

        When the parser determines the appropriate type of a block, the
        parser will call the corresponding processor's ``run`` method. This
        method should parse the individual lines of the block and append them
        to the etree.

        Note that both the ``parent`` and ``blocks`` keywords are pointers
        to instances of the objects which should be edited in place. Each
        processor must make changes to the existing objects as there is no
        mechanism to return new/different objects to replace them.

        This means that this method should be adding SubElements or adding
        text to the parent, and should remove (``pop``) or add (``insert``)
        items to the list of blocks.

        Keywords:

        * ``parent``: A etree element which is the parent of the current
          block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
        pass
|
| +
|
| +
|
class ListIndentProcessor(BlockProcessor):
    """ Process children of list items.

    Example:
        * a list item
            process this part

            or this part

    """

    # Tags treated as list items and as list containers respectively.
    ITEM_TYPES = ['li']
    LIST_TYPES = ['ul', 'ol']

    def __init__(self, *args):
        BlockProcessor.__init__(self, *args)
        # Matches one or more whole indent units at the start of a block.
        self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length)

    def test(self, parent, block):
        """ Return True for an indented block that belongs to a list.

        The block must start with one indent unit, the parser must not
        already be in the ``detabbed`` state, and either ``parent`` is a list
        item or the last child of ``parent`` is a non-empty list.
        """
        if not block.startswith(' ' * self.tab_length):
            return False
        if self.parser.state.isstate('detabbed'):
            return False
        if parent.tag in self.ITEM_TYPES:
            return True
        # ``len(parent[-1])`` spells out the "has children" check explicitly;
        # truth-testing an Element directly is deprecated.
        return bool(
            len(parent) and len(parent[-1]) and
            parent[-1].tag in self.LIST_TYPES
        )

    def run(self, parent, blocks):
        """ Detab the first block and parse it under the right list element. """
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        self.parser.state.set('detabbed')
        if parent.tag in self.ITEM_TYPES:
            # It's possible that this parent has a 'ul' or 'ol' child list
            # with a member. If that is the case, then that should be the
            # parent. This is intended to catch the edge case of an indented
            # list whose first member was parsed previous to this point;
            # see OListProcessor.
            if len(parent) and parent[-1].tag in self.LIST_TYPES:
                self.parser.parseBlocks(parent[-1], [block])
            else:
                # The parent is already a li. Just parse the child block.
                self.parser.parseBlocks(parent, [block])
        elif sibling.tag in self.ITEM_TYPES:
            # The sibling is a li. Use it as parent.
            self.parser.parseBlocks(sibling, [block])
        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
            # The parent is a list (``ol`` or ``ul``) which has children.
            # Assume the last child li is the parent of this block.
            last_item = sibling[-1]
            if last_item.text:
                # If the parent li has text, that text needs to be moved to a
                # p. The p must be 'inserted' at beginning of list in the
                # event that other children already exist, i.e. a nested
                # sublist.
                p = util.etree.Element('p')
                p.text = last_item.text
                last_item.text = ''
                last_item.insert(0, p)
            self.parser.parseChunk(last_item, block)
        else:
            self.create_item(sibling, block)
        self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        li = util.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(li, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns a ``(level, element)`` tuple: the number of list levels
        matched and the element reached while descending through the last
        children of ``parent``.
        """
        # Get indent level. INDENT_RE only matches whole indent units, so
        # integer division is exact here.
        m = self.INDENT_RE.match(block)
        if m:
            indent_level = len(m.group(1)) // self.tab_length
        else:
            indent_level = 0
        if self.parser.state.isstate('list'):
            # We're in a tightlist - so we already are at correct parent.
            level = 1
        else:
            # We're in a looselist - so we need to find parent.
            level = 0
        # Step through children of tree to find matching indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            # Only descend into list/item elements that themselves have
            # children (explicit form of the former Element truth test).
            if (child is not None and len(child) and
                    (child.tag in self.LIST_TYPES or
                     child.tag in self.ITEM_TYPES)):
                if child.tag in self.LIST_TYPES:
                    level += 1
                parent = child
            else:
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
        return level, parent
|
| +
|
| +
|
class CodeBlockProcessor(BlockProcessor):
    """ Process code blocks. """

    def test(self, parent, block):
        """ Return True when the block starts with one indent unit. """
        return block.startswith(' ' * self.tab_length)

    def run(self, parent, blocks):
        """ Append the block to the previous code block or start a new one.

        The indented head of the first block becomes code text; any
        unindented remainder is pushed back onto ``blocks``.
        """
        sibling = self.lastChild(parent)
        block = blocks.pop(0)
        # Split the block into its indented head and the unindented rest.
        block, theRest = self.detab(block)
        # Compare against None explicitly: truth-testing an Element is
        # deprecated, and ``len(sibling)`` already covers the "has children"
        # requirement.
        if (sibling is not None and sibling.tag == "pre" and len(sibling)
                and sibling[0].tag == "code"):
            # The previous block was a code block. As blank lines do not
            # start new code blocks, append this block to the previous,
            # adding back linebreaks removed from the split into a list.
            code = sibling[0]
            code.text = util.AtomicString(
                '%s\n%s\n' % (code.text, block.rstrip())
            )
        else:
            # This is a new codeblock. Create the elements and insert text.
            pre = util.etree.SubElement(parent, 'pre')
            code = util.etree.SubElement(pre, 'code')
            code.text = util.AtomicString('%s\n' % block.rstrip())
        if theRest:
            # This block contained unindented line(s) after the first
            # indented line. Insert these lines as the first block of the
            # master blocks list for future processing.
            blocks.insert(0, theRest)
|
| +
|
| +
|
class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquotes. """

    # A ``>`` marker at the start of any line, indented by at most three
    # spaces. ``group(2)`` holds the rest of that line.
    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        """ Return True when any line of the block carries a quote marker. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Parse the first block into a (new or continued) blockquote. """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()]  # Lines before blockquote
            # Pass lines before blockquote in recursively for parsing first.
            self.parser.parseBlocks(parent, [before])
            # Remove ``> `` from beginning of each line.
            block = '\n'.join(
                [self.clean(line) for line in block[m.start():].split('\n')]
            )
        sibling = self.lastChild(parent)
        # Compare against None: an Element without children is falsy, so a
        # plain truth test would overlook an existing but still empty
        # blockquote and start a second one.
        if sibling is not None and sibling.tag == "blockquote":
            # Previous block was a blockquote so set that as this block's
            # parent.
            quote = sibling
        else:
            # This is a new blockquote. Create a new parent element.
            quote = util.etree.SubElement(parent, 'blockquote')
        # Recursively parse block with blockquote as parent.
        # Change parser state so blockquotes embedded in lists use p tags.
        self.parser.state.set('blockquote')
        self.parser.parseChunk(quote, block)
        self.parser.state.reset()

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        m = self.RE.match(line)
        if line.strip() == ">":
            return ""
        elif m:
            return m.group(2)
        else:
            return line
|
| +
|
class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
    # Detect items on secondary lines. They can be of either list type.
    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
    # Detect indented (nested) items of either type.
    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')
    # Leading digits of a list marker (compiled once, raw string).
    INTEGER_RE = re.compile(r'(\d+)')
    # The integer (as a string) with which the list starts (default=1).
    # E.g. if the list is initialized as
    #   3. Item
    # the ol tag will get a start="3" attribute.
    STARTSWITH = '1'
    # List of allowed sibling tags.
    SIBLING_TAGS = ['ol', 'ul']

    def test(self, parent, block):
        """ Return True when the block starts with a list item marker. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Parse the first block as list items under the proper list. """
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)

        # The sibling must be a list that already has items, because
        # ``lst[-1]`` is used below. ``len(sibling)`` states that requirement
        # explicitly instead of relying on deprecated Element truth-testing.
        if (sibling is not None and len(sibling) and
                sibling.tag in self.SIBLING_TAGS):
            # Previous block was a list item, so set that as parent.
            lst = sibling
            # Make sure previous item is in a p - if the item has text, then
            # it isn't in a p.
            if lst[-1].text:
                # Since it's possible there are other children for this
                # sibling, we can't just SubElement the p, we need to insert
                # it as the first item.
                p = util.etree.Element('p')
                p.text = lst[-1].text
                lst[-1].text = ''
                lst[-1].insert(0, p)
            # If the last item has a tail, then the tail needs to be put in
            # a p; likely only when a header is not followed by a blank line.
            lch = self.lastChild(lst[-1])
            if lch is not None and lch.tail:
                p = util.etree.SubElement(lst[-1], 'p')
                p.text = lch.tail.lstrip()
                lch.tail = ''

            # Parse first block differently as it gets wrapped in a p.
            li = util.etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        elif parent.tag in ['ol', 'ul']:
            # This catches the edge case of a multi-item indented list whose
            # first item is in a blank parent-list item:
            # * * subitem1
            #     * subitem2
            # see also ListIndentProcessor
            lst = parent
        else:
            # This is a new list so create parent with appropriate tag.
            lst = util.etree.SubElement(parent, self.TAG)
            # Check if a custom start integer is set.
            if not self.parser.markdown.lazy_ol and self.STARTSWITH != '1':
                lst.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        # Loop through items in block, recursively parsing each with the
        # appropriate parent.
        indent = ' ' * self.tab_length
        for item in items:
            if item.startswith(indent):
                # Item is indented. Parse with last item as parent.
                self.parser.parseBlocks(lst[-1], [item])
            else:
                # New item. Create li and parse with it as parent.
                li = util.etree.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items.

        Returns a list of strings, one per item. As a side effect, when
        ``TAG`` is ``'ol'`` the number of the first item is stored in
        ``self.STARTSWITH``.
        """
        indent = ' ' * self.tab_length
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new list item.
                # Check first item for the start index.
                if not items and self.TAG == 'ol':
                    # Detect the integer value of first list item.
                    self.STARTSWITH = self.INTEGER_RE.match(m.group(1)).group()
                # Append to the list.
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(indent):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items
|
| +
|
| +
|
class UListProcessor(OListProcessor):
    """ Process unordered list blocks.

    Reuses the ordered-list machinery; only the emitted tag and the pattern
    that recognises the first item are overridden.
    """

    # Element created for a new list.
    TAG = 'ul'
    # Detect an item (``* item``, ``+ item`` or ``- item``) indented by at
    # most three spaces. ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')
|
| +
|
| +
|
class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers. """

    # Detect a header at start of any line in block.
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        """ Return True when any line of the block is a hash header. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Turn the first header found in the block into an ``hN`` element.

        Lines before the header are parsed first; lines after it are pushed
        back onto ``blocks`` for later processing.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if not m:
            # This should never happen, but just in case...
            # ``warning`` replaces the deprecated ``warn`` alias; arguments
            # are passed lazily so formatting only happens if the record is
            # actually emitted.
            logger.warning("We've got a problem header: %r", block)
            return
        before = block[:m.start()]  # All lines before header
        after = block[m.end():]     # All lines after header
        if before:
            # As the header was not the first line of the block and the
            # lines before the header must be parsed first,
            # recursively parse these lines as a block.
            self.parser.parseBlocks(parent, [before])
        # Create header using named groups from RE.
        h = util.etree.SubElement(parent, 'h%d' % len(m.group('level')))
        h.text = m.group('header').strip()
        if after:
            # Insert remaining lines as first block for future parsing.
            blocks.insert(0, after)
|
| +
|
| +
|
class SetextHeaderProcessor(BlockProcessor):
    """ Process Setext-style Headers.

    A Setext header is a line of text followed by an underline made of
    ``=`` or ``-`` characters.
    """

    # Detect Setext-style header. Must be first 2 lines of block.
    RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE)

    def test(self, parent, block):
        """ Return True when the block opens with a Setext header. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Emit the header element and requeue any lines that follow it. """
        title, underline, *remainder = blocks.pop(0).split('\n')
        # An ``=`` underline is a level 1 header; anything else is level 2.
        depth = 1 if underline.startswith('=') else 2
        header = util.etree.SubElement(parent, 'h%d' % depth)
        header.text = title.strip()
        if remainder:
            # Block contains additional lines. Add to master blocks for later.
            blocks.insert(0, '\n'.join(remainder))
|
| +
|
| +
|
class HRProcessor(BlockProcessor):
    """ Process Horizontal Rules.

    ``test`` records its regex match on the instance and ``run`` reads it
    back, so ``run`` must only be called right after a successful ``test``
    on the same block.
    """

    RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
    # Detect hr on any line of a block.
    SEARCH_RE = re.compile(RE, re.MULTILINE)

    def test(self, parent, block):
        """ Return True when some line of the block is a horizontal rule. """
        found = self.SEARCH_RE.search(block)
        if found is None:
            return False
        # No atomic grouping in python so we simulate it here for performance.
        # The regex only matches what would be in the atomic group - the HR.
        # The match counts only if it runs to the end of the block or is
        # directly followed by a newline.
        stop = found.end()
        if stop != len(block) and block[stop] != '\n':
            return False
        # Save match object on class instance so ``run`` can use it later.
        self.match = found
        return True

    def run(self, parent, blocks):
        """ Emit an ``hr`` and route the text around it to the parser. """
        block = blocks.pop(0)
        found = self.match
        # Lines in the block before the hr are parsed first, recursively.
        leading = block[:found.start()].rstrip('\n')
        if leading:
            self.parser.parseBlocks(parent, [leading])
        # Create the hr itself.
        util.etree.SubElement(parent, 'hr')
        # Lines in the block after the hr go back to the master blocks list
        # for later parsing.
        trailing = block[found.end():].lstrip('\n')
        if trailing:
            blocks.insert(0, trailing)
|
| +
|
| +
|
| +
|
class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that are empty or start with an empty line. """

    def test(self, parent, block):
        """ Return True for an empty block or one starting with a newline. """
        return not block or block.startswith('\n')

    def run(self, parent, blocks):
        """ Consume the blank part of the first block.

        If the preceding element is a code block, the consumed whitespace is
        appended to its text so blank lines inside code are preserved.
        """
        block = blocks.pop(0)
        filler = '\n\n'
        if block:
            # Starts with empty line.
            # Only replace a single line.
            filler = '\n'
            # Save the rest for later.
            theRest = block[1:]
            if theRest:
                # Add remaining lines to master blocks for later.
                blocks.insert(0, theRest)
        sibling = self.lastChild(parent)
        # Compare against None explicitly: truth-testing an Element is
        # deprecated, and ``len(sibling)`` already covers the "has children"
        # requirement.
        if (sibling is not None and sibling.tag == 'pre' and len(sibling)
                and sibling[0].tag == 'code'):
            # Last block is a codeblock. Append to preserve whitespace.
            sibling[0].text = util.AtomicString(
                '%s%s' % (sibling[0].text, filler)
            )
|
| +
|
| +
|
class ParagraphProcessor(BlockProcessor):
    """ Process Paragraph blocks.

    Accepts every block, so it acts as the catch-all for text that no other
    processor claimed.
    """

    def test(self, parent, block):
        """ Always return True. """
        return True

    def run(self, parent, blocks):
        """ Attach the first block to ``parent`` as paragraph text. """
        text = blocks.pop(0)
        if not text.strip():
            # A blank block is simply thrown away.
            return

        if not self.parser.state.isstate('list'):
            # Create a regular paragraph.
            paragraph = util.etree.SubElement(parent, 'p')
            paragraph.text = text.lstrip()
            return

        # The parent is a tight-list.
        #
        # Check for any children. This will likely only happen in a
        # tight-list when a header isn't followed by a blank line.
        # For example:
        #
        #     * # Header
        #     Line 2 of list item - not part of header.
        previous = self.lastChild(parent)
        if previous is not None:
            # Insert after the existing child, via its tail.
            if previous.tail:
                previous.tail = '%s\n%s' % (previous.tail, text)
            else:
                previous.tail = '\n%s' % text
        elif parent.text:
            # Append to the text the parent already holds.
            parent.text = '%s\n%s' % (parent.text, text)
        else:
            parent.text = text.lstrip()
|
|
|