third_party/Python-Markdown/markdown/blockprocessors.py - Issue 1392733002: Re-land "Check in a simple pure-python based Markdown previewer."

Unified Diff: third_party/Python-Markdown/markdown/blockprocessors.py

Issue 1392733002: Re-land "Check in a simple pure-python based Markdown previewer." (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: clarify comment re: licenses, add bug #, use --no-find-copies Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/Python-Markdown/markdown/blockparser.py ('k') | third_party/Python-Markdown/markdown/extensions/__init__.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/Python-Markdown/markdown/blockprocessors.py

diff --git a/third_party/Python-Markdown/markdown/blockprocessors.py b/third_party/Python-Markdown/markdown/blockprocessors.py

new file mode 100644

index 0000000000000000000000000000000000000000..29db022cee111b062818853bebaf99d6ffa1dcba

--- /dev/null

+++ b/third_party/Python-Markdown/markdown/blockprocessors.py

@@ -0,0 +1,563 @@

+"""

+CORE MARKDOWN BLOCKPARSER

+===========================================================================

+This parser handles basic parsing of Markdown blocks. It doesn't concern

+itself with inline elements such as **bold** or *italics*, but rather just

+catches blocks, lists, quotes, etc.

+The BlockParser is made up of a bunch of BlockProcessors, each handling a

+different type of block. Extensions may add/replace/remove BlockProcessors

+as they need to alter how markdown blocks are parsed.

+"""

+from __future__ import absolute_import

+from __future__ import division

+from __future__ import unicode_literals

+import logging

+import re

+from . import util

+from .blockparser import BlockParser

+logger = logging.getLogger('MARKDOWN')

def build_block_parser(md_instance, **kwargs):
    """ Build the default block parser used by Markdown.

    Creates a ``BlockParser`` for ``md_instance`` and registers one instance
    of each core block processor on it, under the names and in the order
    listed below. Extra keyword arguments are accepted but not used here.
    """
    parser = BlockParser(md_instance)
    # (registration name, processor class) pairs, registered in this order.
    default_processors = (
        ('empty', EmptyBlockProcessor),
        ('indent', ListIndentProcessor),
        ('code', CodeBlockProcessor),
        ('hashheader', HashHeaderProcessor),
        ('setextheader', SetextHeaderProcessor),
        ('hr', HRProcessor),
        ('olist', OListProcessor),
        ('ulist', UListProcessor),
        ('quote', BlockQuoteProcessor),
        ('paragraph', ParagraphProcessor),
    )
    for name, processor_class in default_processors:
        parser.blockprocessors[name] = processor_class(parser)
    return parser

class BlockProcessor(object):
    """ Base class for block processors.

    Each subclass will provide the methods below to work with the source and
    tree. Each processor will need to define its own ``test`` and ``run``
    methods. The ``test`` method should return True or False, to indicate
    whether the current block should be processed by this processor. If the
    test passes, the parser will call the processor's ``run`` method.

    """

    def __init__(self, parser):
        """ Store the owning parser and cache its Markdown tab length. """
        self.parser = parser
        self.tab_length = parser.markdown.tab_length

    def lastChild(self, parent):
        """ Return the last child of an etree element, or None if empty. """
        if len(parent):
            return parent[-1]
        return None

    def detab(self, text):
        """ Remove a tab from the front of each line of the given text.

        Processing stops at the first non-blank line that is not indented by
        a full tab. Returns a tuple ``(detabbed, rest)`` where ``detabbed``
        holds the leading lines with one tab removed (blank lines become
        empty strings) and ``rest`` holds the remaining, untouched lines.
        """
        indent = ' ' * self.tab_length
        lines = text.split('\n')
        detabbed = []
        for line in lines:
            if line.startswith(indent):
                detabbed.append(line[self.tab_length:])
            elif not line.strip():
                detabbed.append('')
            else:
                break
        return '\n'.join(detabbed), '\n'.join(lines[len(detabbed):])

    def looseDetab(self, text, level=1):
        """ Remove a tab from front of lines but allowing dedented lines.

        ``level`` is the number of tabs to strip. Lines that are not
        indented by at least that much are kept unchanged.
        """
        width = self.tab_length * level
        indent = ' ' * width
        return '\n'.join(
            line[width:] if line.startswith(indent) else line
            for line in text.split('\n')
        )

    def test(self, parent, block):
        """ Test for block type. Must be overridden by subclasses.

        As the parser loops through processors, it will call the ``test``
        method on each to determine if the given block of text is of that
        type. This method must return a boolean ``True`` or ``False``. The
        actual method of testing is left to the needs of that particular
        block type. It could be as simple as ``block.startswith(some_string)``
        or a complex regular expression. As the block type may be different
        depending on the parent of the block (i.e. inside a list), the parent
        etree element is also provided and may be used as part of the test.

        Keywords:

        * ``parent``: An etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
            blank lines.
        """
        pass  # pragma: no cover

    def run(self, parent, blocks):
        """ Run processor. Must be overridden by subclasses.

        When the parser determines the appropriate type of a block, the parser
        will call the corresponding processor's ``run`` method. This method
        should parse the individual lines of the block and append them to
        the etree.

        Note that both the ``parent`` and ``blocks`` keywords are pointers
        to instances of the objects which should be edited in place. Each
        processor must make changes to the existing objects as there is no
        mechanism to return new/different objects to replace them.

        This means that this method should be adding SubElements or adding text
        to the parent, and should remove (``pop``) or add (``insert``) items to
        the list of blocks.

        Keywords:

        * ``parent``: An etree element which is the parent of the current block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
        pass  # pragma: no cover

class ListIndentProcessor(BlockProcessor):
    """ Process indented blocks that are children of list items.

    Example:

        * a list item
            process this part

            or this part

    """

    # Tags treated as list items and as list containers respectively.
    ITEM_TYPES = ['li']
    LIST_TYPES = ['ul', 'ol']

    def __init__(self, *args):
        BlockProcessor.__init__(self, *args)
        # Matches one or more whole tab-widths of leading spaces.
        self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length)

    def test(self, parent, block):
        """ Accept an indented block that sits in or directly after a list.

        The block must start with a full tab of spaces, the parser must not
        already be in the 'detabbed' state, and either ``parent`` is a list
        item or the last child of ``parent`` is a list.
        """
        return (
            block.startswith(' ' * self.tab_length) and
            not self.parser.state.isstate('detabbed') and
            (
                parent.tag in self.ITEM_TYPES or
                (
                    len(parent) and
                    parent[-1] is not None and
                    parent[-1].tag in self.LIST_TYPES
                )
            )
        )

    def run(self, parent, blocks):
        """ Detab the next block and parse it under the right list element. """
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        self.parser.state.set('detabbed')
        if parent.tag in self.ITEM_TYPES:
            # The parent li may already hold a 'ul' or 'ol' child list with
            # a member; in that case that list is the real parent. This
            # catches the edge case of an indented list whose first member
            # was parsed before this point (see OListProcessor).
            if len(parent) and parent[-1].tag in self.LIST_TYPES:
                target = parent[-1]
            else:
                # The parent is already a li. Just parse the child block.
                target = parent
            self.parser.parseBlocks(target, [block])
        elif sibling.tag in self.ITEM_TYPES:
            # The sibling is a li. Use it as parent.
            self.parser.parseBlocks(sibling, [block])
        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
            # The sibling is a list (``ol`` or ``ul``) which has children.
            # Assume its last child li is the parent of this block.
            last_item = sibling[-1]
            if last_item.text:
                # The li has bare text, which must be moved into a p. The p
                # is inserted at the front because other children (such as
                # a nested sublist) may already exist.
                wrapper = util.etree.Element('p')
                wrapper.text = last_item.text
                last_item.text = ''
                last_item.insert(0, wrapper)
            self.parser.parseChunk(last_item, block)
        else:
            self.create_item(sibling, block)
        self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        new_item = util.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(new_item, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns a tuple ``(level, element)``: the number of list levels
        descended and the element reached, starting from ``parent``.
        """
        # Indent depth of the block, measured in tab-widths.
        found = self.INDENT_RE.match(block)
        indent_level = len(found.group(1))/self.tab_length if found else 0

        # In a tight list we are already at the correct parent (level 1);
        # in a loose list the parent still has to be found (level 0).
        level = 1 if self.parser.state.isstate('list') else 0

        # Step through the last children of the tree to find the matching
        # indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            if child is None or (child.tag not in self.LIST_TYPES and
                                 child.tag not in self.ITEM_TYPES):
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
            if child.tag in self.LIST_TYPES:
                level += 1
            parent = child
        return level, parent

class CodeBlockProcessor(BlockProcessor):
    """ Process indented code blocks. """

    def test(self, parent, block):
        """ Accept any block that starts with a full tab of spaces. """
        return block.startswith(' ' * self.tab_length)

    def run(self, parent, blocks):
        """ Append the block to a preceding code block or start a new one. """
        sibling = self.lastChild(parent)
        block = blocks.pop(0)
        continues_code = (
            sibling is not None and sibling.tag == "pre" and
            len(sibling) and sibling[0].tag == "code"
        )
        if continues_code:
            # The previous block was a code block. As blank lines do not
            # start new code blocks, append this block to the previous one,
            # adding back the linebreaks removed by the split into a list.
            code = sibling[0]
            detabbed, remainder = self.detab(block)
            text = '%s\n%s\n' % (code.text, detabbed.rstrip())
        else:
            # This is a new codeblock. Create the elements and insert text.
            pre = util.etree.SubElement(parent, 'pre')
            code = util.etree.SubElement(pre, 'code')
            detabbed, remainder = self.detab(block)
            text = '%s\n' % detabbed.rstrip()
        code.text = util.AtomicString(text)
        if remainder:
            # The block contained unindented line(s) after the indented
            # ones. Put them back at the front of the master blocks list
            # for future processing.
            blocks.insert(0, remainder)

class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquote blocks (lines introduced by ``>``). """

    # A ``>`` marker at the start of any line, after up to three spaces.
    # ``group(2)`` holds the rest of that line.
    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        """ Accept a block in which any line starts a blockquote. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Parse the quoted lines of the next block inside a blockquote. """
        block = blocks.pop(0)
        found = self.RE.search(block)
        if found:
            start = found.start()
            # Lines before the blockquote are parsed first, recursively.
            self.parser.parseBlocks(parent, [block[:start]])
            # Remove ``> `` from the beginning of each remaining line.
            stripped = [self.clean(line) for line in block[start:].split('\n')]
            block = '\n'.join(stripped)
        sibling = self.lastChild(parent)
        if sibling is not None and sibling.tag == "blockquote":
            # Previous block was a blockquote, so reuse it as the parent.
            quote = sibling
        else:
            # This is a new blockquote. Create a new parent element.
            quote = util.etree.SubElement(parent, 'blockquote')
        # Recursively parse the block with the blockquote as parent. The
        # parser state is changed so blockquotes embedded in lists use p tags.
        self.parser.state.set('blockquote')
        self.parser.parseChunk(quote, block)
        self.parser.state.reset()

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        if line.strip() == ">":
            return ""
        found = self.RE.match(line)
        return found.group(2) if found else line

class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
    # Detect items on secondary lines. They can be of either list type.
    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
    # Detect indented (nested) items of either type.
    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')
    # Extract the leading integer of an item marker such as ``3.``.
    # Compiled once here, as a raw string, rather than per line in get_items.
    INTEGER_RE = re.compile(r'(\d+)')
    # The integer (as a string) with which the list starts (default=1).
    # E.g. if the list is initialized as
    #   3. Item
    # the ol tag will get a start="3" attribute (unless ``lazy_ol`` is set).
    STARTSWITH = '1'
    # List of allowed sibling tags.
    SIBLING_TAGS = ['ol', 'ul']

    def test(self, parent, block):
        """ Accept a block whose first line is a list item of this type. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Parse the next block as list items under a new or existing list. """
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)

        if sibling is not None and sibling.tag in self.SIBLING_TAGS:
            # Previous block was a list, so continue it.
            lst = sibling
            last_item = lst[-1]
            # Make sure the previous item is in a p: if the item has text,
            # then it isn't in a p.
            if last_item.text:
                # Since it's possible there are other children for this
                # sibling, we can't just SubElement the p, we need to
                # insert it as the first item.
                p = util.etree.Element('p')
                p.text = last_item.text
                last_item.text = ''
                last_item.insert(0, p)
            # If the last item has a tail, then the tail needs to be put in
            # a p. Likely only when a header is not followed by a blank line.
            lch = self.lastChild(last_item)
            if lch is not None and lch.tail:
                p = util.etree.SubElement(last_item, 'p')
                p.text = lch.tail.lstrip()
                lch.tail = ''

            # Parse the first item differently as it gets wrapped in a p.
            li = util.etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        elif parent.tag in ['ol', 'ul']:
            # This catches the edge case of a multi-item indented list whose
            # first item is in a blank parent-list item:
            # * * subitem1
            #     * subitem2
            # see also ListIndentProcessor
            lst = parent
        else:
            # This is a new list so create parent with appropriate tag.
            lst = util.etree.SubElement(parent, self.TAG)
            # Check if a custom start integer is set.
            if not self.parser.markdown.lazy_ol and self.STARTSWITH != '1':
                lst.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        # Loop through items in block, recursively parsing each with the
        # appropriate parent.
        indent = ' ' * self.tab_length
        for item in items:
            if item.startswith(indent):
                # Item is indented. Parse with last item as parent.
                self.parser.parseBlocks(lst[-1], [item])
            else:
                # New item. Create li and parse with it as parent.
                li = util.etree.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items.

        Returns a list of strings, one per item. For an ordered list, the
        number of the first item is also recorded in ``self.STARTSWITH``.
        """
        indent = ' ' * self.tab_length
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new list item.
                # Check first item for the start index.
                if not items and self.TAG == 'ol':
                    # Detect the integer value of first list item.
                    self.STARTSWITH = self.INTEGER_RE.match(m.group(1)).group()
                # Append to the list.
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(indent):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items

class UListProcessor(OListProcessor):
    """ Process unordered list blocks.

    Reuses the ordered-list machinery, overriding only the list tag and the
    pattern that recognises an item of this list type.
    """

    TAG = 'ul'
    # Detect an item (``* item``, ``+ item`` or ``- item``). ``group(1)``
    # contains contents of item.
    RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')

class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers. """

    # Detect a header at start of any line in block. ``level`` captures the
    # run of one to six ``#`` characters and ``header`` the text after it;
    # trailing ``#`` characters are excluded from ``header``.
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        """ Accept a block in which any line starts with ``#``. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Turn the first hash header in the next block into an ``hN``.

        Lines before the header are parsed immediately; lines after it are
        pushed back onto ``blocks`` for later processing.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()]  # All lines before header
            after = block[m.end():]     # All lines after header
            if before:
                # As the header was not the first line of the block and the
                # lines before the header must be parsed first,
                # recursively parse these lines as a block.
                self.parser.parseBlocks(parent, [before])
            # Create header using named groups from RE
            h = util.etree.SubElement(parent, 'h%d' % len(m.group('level')))
            h.text = m.group('header').strip()
            if after:
                # Insert remaining lines as first block for future parsing.
                blocks.insert(0, after)
        else:  # pragma: no cover
            # This should never happen, but just in case...
            # ``Logger.warn`` is a deprecated alias; use ``warning`` and let
            # logging do the %-formatting.
            logger.warning("We've got a problem header: %r", block)

class SetextHeaderProcessor(BlockProcessor):
    """ Process Setext-style Headers (text underlined with ``=`` or ``-``). """

    # Detect Setext-style header. Must be first 2 lines of block.
    RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE)

    def test(self, parent, block):
        """ Accept a block whose first two lines form a Setext header. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Create the header element and re-queue any remaining lines. """
        title, underline, *remaining = blocks.pop(0).split('\n')
        # Determine level. ``=`` is 1 and ``-`` is 2.
        level = 1 if underline.startswith('=') else 2
        header = util.etree.SubElement(parent, 'h%d' % level)
        header.text = title.strip()
        if remaining:
            # Block contains additional lines. Add to master blocks for later.
            blocks.insert(0, '\n'.join(remaining))

class HRProcessor(BlockProcessor):
    """ Process Horizontal Rules. """

    # Up to three leading spaces, then three or more ``-``, ``_`` or ``*``
    # groups (each optionally followed by up to two spaces), then trailing
    # spaces.
    RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
    # Detect hr on any line of a block.
    SEARCH_RE = re.compile(RE, re.MULTILINE)

    def test(self, parent, block):
        """ Accept a block containing a line that is a horizontal rule.

        On success the match object is stored on the instance as
        ``self.match`` for use by ``run``.
        """
        m = self.SEARCH_RE.search(block)
        # No atomic grouping in python so we simulate it here for performance.
        # The regex only matches what would be in the atomic group - the HR.
        # Then check if we are at end of block or if next char is a newline.
        if m and (m.end() == len(block) or block[m.end()] == '\n'):
            # Save match object on class instance so we can use it later.
            self.match = m
            return True
        return False

    def run(self, parent, blocks):
        """ Insert an ``hr`` and split the block around it.

        Lines before the rule are parsed immediately; lines after it are
        pushed back onto ``blocks`` for later processing.
        """
        block = blocks.pop(0)
        # Bind the match to a local before recursing: parsing the lines
        # before the hr can call ``test`` on this same instance and replace
        # ``self.match``, which would make the ``postlines`` slice below use
        # offsets from a different block.
        match = self.match
        # Check for lines in block before hr.
        prelines = block[:match.start()].rstrip('\n')
        if prelines:
            # Recursively parse lines before hr so they get parsed first.
            self.parser.parseBlocks(parent, [prelines])
        # create hr
        util.etree.SubElement(parent, 'hr')
        # check for lines in block after hr.
        postlines = block[match.end():].lstrip('\n')
        if postlines:
            # Add lines after hr to master blocks for later parsing.
            blocks.insert(0, postlines)

class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that are empty or start with an empty line. """

    def test(self, parent, block):
        """ Accept an empty block or one that begins with a newline. """
        return not block or block.startswith('\n')

    def run(self, parent, blocks):
        """ Consume the blank part of the next block.

        If the preceding element is a code block, the consumed blank space is
        appended to it so its whitespace is preserved.
        """
        block = blocks.pop(0)
        if block:
            # Starts with an empty line: only replace that single line and
            # save the rest for later.
            filler = '\n'
            remainder = block[1:]
            if remainder:
                # Add remaining lines to master blocks for later.
                blocks.insert(0, remainder)
        else:
            filler = '\n\n'
        sibling = self.lastChild(parent)
        after_code_block = (
            sibling is not None and sibling.tag == 'pre' and
            len(sibling) and sibling[0].tag == 'code'
        )
        if after_code_block:
            # Last block is a codeblock. Append to preserve whitespace.
            code = sibling[0]
            code.text = util.AtomicString('%s%s' % (code.text, filler))

class ParagraphProcessor(BlockProcessor):
    """ Process Paragraph blocks. """

    def test(self, parent, block):
        """ Accept every block. """
        return True

    def run(self, parent, blocks):
        """ Add the next block to ``parent`` as paragraph text.

        Blank blocks are thrown away. In the 'list' parser state the text is
        attached directly to the parent (or to the tail of its last child);
        otherwise a new ``p`` element is created.
        """
        block = blocks.pop(0)
        if not block.strip():
            # A blank block. Throw it away.
            return

        if not self.parser.state.isstate('list'):
            # Create a regular paragraph.
            paragraph = util.etree.SubElement(parent, 'p')
            paragraph.text = block.lstrip()
            return

        # The parent is a tight-list.
        #
        # Check for any children. This will likely only happen in a
        # tight-list when a header isn't followed by a blank line.
        # For example:
        #
        #     * # Header
        #     Line 2 of list item - not part of header.
        sibling = self.lastChild(parent)
        if sibling is not None:
            # Insert after sibling.
            if sibling.tail:
                sibling.tail = '%s\n%s' % (sibling.tail, block)
            else:
                sibling.tail = '\n%s' % block
        elif parent.text:
            # Append to parent.text
            parent.text = '%s\n%s' % (parent.text, block)
        else:
            parent.text = block.lstrip()

« no previous file with comments | « third_party/Python-Markdown/markdown/blockparser.py ('k') | third_party/Python-Markdown/markdown/extensions/__init__.py » ('j') | no next file with comments »