utils/markdown/block_parser.dart - Issue 8680025: First pass at a markdown parser in Dart.

Unified Diff: utils/markdown/block_parser.dart

Issue 8680025: First pass at a markdown parser in Dart. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Review. Add missing file (oops!). Created 9 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: utils/markdown/block_parser.dart

diff --git a/utils/markdown/block_parser.dart b/utils/markdown/block_parser.dart

new file mode 100644

index 0000000000000000000000000000000000000000..d80c9e95c649aefe6ef928478a31fdfb2e4e2fbb

--- /dev/null

+++ b/utils/markdown/block_parser.dart

@@ -0,0 +1,433 @@

+// BSD-style license that can be found in the LICENSE file.

+/// The line contains only whitespace or is empty.

+final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

+/// A series of "=" or "-" (on the next line) define setext-style headers.

+final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

+/// Leading (and trailing) "#" define atx-style headers.

+final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

+/// The line starts with ">" with one optional space after.

+final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

+/// A line indented four spaces. Used for code blocks and lists.

+final _RE_INDENT = const RegExp(@'^(?: |\t)(.*)$');

+/// Three or more hyphens, asterisks or underscores by themselves. Note that

+/// a line like "----" is valid as both HR and SETEXT. In case of a tie,

+/// SETEXT should win.

+final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +

+ @'(_+[ ]{0,2}){3,}|' +

+ @'(\*+[ ]{0,2}){3,})$');

+/// Really hacky way to detect block-level embedded HTML. Just looks for

+/// "<somename".

+final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

+/// A line starting with one of these markers: "-", "*", "+". May have up to

+/// three leading spaces before the marker and any number of spaces or tabs

+/// after.

+final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

+/// A line starting with a number like "123.". May have up to three leading

+/// spaces before the marker and any number of spaces or tabs after.

+final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');

+/// Maintains the internal state needed to parse a series of lines into blocks

+/// of markdown suitable for further inline parsing.

+class BlockParser {

+ final List<String> lines;

+ /// The markdown document this parser is parsing.

+ final Document document;

+ /// Index of the current line.

+ int pos;

+ BlockParser(this.lines, this.document)

+ : pos = 0;

+ /// Gets the current line.

+ String get current() => lines[pos];

+ /// Gets the line after the current one or `null` if there is none.

+ String get next() {

+ // Don't read past the end.

+ if (pos >= lines.length - 1) return null;

+ return lines[pos + 1];

+ }

+ void advance() => pos++;

+ bool get isDone() => pos >= lines.length;

+ /// Gets whether or not the current line matches the given pattern.

+ bool matches(RegExp regex) {

+ if (isDone) return false;

+ return regex.firstMatch(current) != null;

+ }

+ /// Gets whether or not the current line matches the given pattern.

+ bool matchesNext(RegExp regex) {

+ if (next == null) return false;

+ return regex.firstMatch(next) != null;

+ }

+class BlockSyntax {

+ /// Gets the collection of built-in block parsers. To turn a series of lines

+ /// into blocks, each of these will be tried in turn. Order matters here.

+ static List<BlockSyntax> get syntaxes() {

+ // Lazy initialize.

+ if (_syntaxes == null) {

+ _syntaxes = [

+ new EmptyBlockSyntax(),

+ new BlockHtmlSyntax(),

+ new SetextHeaderSyntax(),

+ new HeaderSyntax(),

+ new CodeBlockSyntax(),

+ new BlockquoteSyntax(),

+ new HorizontalRuleSyntax(),

+ new UnorderedListSyntax(),

+ new OrderedListSyntax(),

+ new ParagraphSyntax()

+ ];

+ }

+ return _syntaxes;

+ }

+ static List<BlockSyntax> _syntaxes;

+ /// Gets the regex used to identify the beginning of this block, if any.

+ RegExp get pattern() => null;

+ bool get canEndBlock() => true;

+ bool canParse(BlockParser parser) {

+ return pattern.firstMatch(parser.current) != null;

+ }

+ abstract Node parse(BlockParser parser);

+ List<Node> parseChildLines(BlockParser parser) {

+ // Grab all of the lines that form the blockquote, stripping off the ">".

+ final childLines = [];

+ while (!parser.isDone) {

+ final match = pattern.firstMatch(parser.current);

+ if (match == null) break;

+ childLines.add(match.group(1));

+ parser.advance();

+ }

+ return childLines;

+ }

+ /// Gets whether or not [parser]'s current line should end the previous block.

+ static bool isAtBlockEnd(BlockParser parser) {

+ if (parser.isDone) return true;

+ return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);

+ }

+class EmptyBlockSyntax extends BlockSyntax {

+ RegExp get pattern() => _RE_EMPTY;

+ Node parse(BlockParser parser) {

+ parser.advance();

+ // Don't actually emit anything.

+ return null;

+ }

+/// Parses setext-style headers.

+class SetextHeaderSyntax extends BlockSyntax {

+ bool canParse(BlockParser parser) {

+ // Note: matches *next* line, not the current one. We're looking for the

+ // underlining after this line.

+ return parser.matchesNext(_RE_SETEXT);

+ }

+ Node parse(BlockParser parser) {

+ final match = _RE_SETEXT.firstMatch(parser.next);

+ final tag = (match.group(1)[0] == '=') ? 'h1' : 'h2';

+ final contents = parser.document.parseInline(parser.current);

+ parser.advance();

+ return new Element(tag, contents);

+ }

+/// Parses atx-style headers: "## Header ##".

+class HeaderSyntax extends BlockSyntax {

+ RegExp get pattern() => _RE_HEADER;

+ Node parse(BlockParser parser) {

+ final match = pattern.firstMatch(parser.current);

+ parser.advance();

+ final level = match.group(1).length;

+ final contents = parser.document.parseInline(match.group(2).trim());

+ return new Element('h$level', contents);

+ }

+/// Parses email-style blockquotes: "> quote".

+class BlockquoteSyntax extends BlockSyntax {

+ RegExp get pattern() => _RE_BLOCKQUOTE;

+ Node parse(BlockParser parser) {

+ final childLines = parseChildLines(parser);

+ // Recursively parse the contents of the blockquote.

+ final children = parser.document.parseLines(childLines);

+ return new Element('blockquote', children);

+ }

+/// Parses preformatted code blocks that are indented four spaces.

+class CodeBlockSyntax extends BlockSyntax {

+ RegExp get pattern() => _RE_INDENT;

+ Node parse(BlockParser parser) {

+ final childLines = parseChildLines(parser);

+ // The Markdown tests expect a trailing newline.

+ childLines.add('');

+ // Escape the code.

+ final escaped = escapeHtml(Strings.join(childLines, '\n'));

+ return new Element('pre', [new Element.text('code', escaped)]);

+ }

+/// Parses horizontal rules like "---", "_ _ _", "* * *", etc.

+class HorizontalRuleSyntax extends BlockSyntax {

+ RegExp get pattern() => _RE_HR;

+ Node parse(BlockParser parser) {

+ final match = pattern.firstMatch(parser.current);

+ parser.advance();

+ return new Element.empty('hr');

+ }

+/// Parses inline HTML at the block level. This differs from other markdown

+/// implementations in several ways:

+///

+/// 1. This one is way way WAY simpler.

+/// 2. All HTML tags at the block level will be treated as blocks. If you start

+/// a paragraph with , it will not wrap it in a for you. As soon as

+/// it sees something like HTML, it stops mucking with it until it hits the

+/// next block.

+/// 3. Absolutely no HTML parsing or validation is done. We're a markdown

+/// parser not an HTML parser!

+class BlockHtmlSyntax extends BlockSyntax {

+ RegExp get pattern() => _RE_HTML;

+ bool get canEndBlock() => false;

+ Node parse(BlockParser parser) {

+ final childLines = [];

+ // Eat until we hit a blank line.

+ while (!parser.isDone && !parser.matches(_RE_EMPTY)) {

+ childLines.add(parser.current);

+ parser.advance();

+ }

+ return new Text(Strings.join(childLines, '\n'));

+ }

+class ListItem {

+ bool forceBlock = false;

+ final List<String> lines;

+ ListItem(this.lines);

+/// Base class for both ordered and unordered lists.

+class ListSyntax extends BlockSyntax {

+ bool get canEndBlock() => false;

+ abstract String get listTag();

+ Node parse(BlockParser parser) {

+ final items = <ListItem>[];

+ var childLines = <String>[];

+ endItem() {

+ if (childLines.length > 0) {

+ items.add(new ListItem(childLines));

+ childLines = <String>[];

+ }

+ var match;

+ tryMatch(RegExp pattern) {

+ match = pattern.firstMatch(parser.current);

+ return match != null;

+ }

+ bool afterEmpty = false;

+ while (!parser.isDone) {

+ if (tryMatch(_RE_EMPTY)) {

+ // Add a blank line to the current list item.

+ childLines.add('');

+ } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {

+ // End the current list item and start a new one.

+ endItem();

+ childLines.add(match.group(1));

+ } else if (tryMatch(_RE_INDENT)) {

+ // Strip off indent and add to current item.

+ childLines.add(match.group(1));

+ } else if (isAtBlockEnd(parser)) {

+ // Done with the list.

+ break;

+ } else {

+ // Anything else is paragraph text or other stuff that can be in a list

+ // item. However, if the previous item is a blank line, this means we're

+ // done with the list and are starting a new top-level paragraph.

+ if ((childLines.length > 0) && (childLines.last() == '')) break;

+ childLines.add(parser.current);

+ }

+ parser.advance();

+ }

+ endItem();

+ // Markdown, because it hates us, specifies two kinds of list items. If you

+ // have a list like:

+ //

+ // * one

+ // * two

+ //

+ // Then it will insert the conents of the lines directly in the <li>, like:

+ // <ul>

+ // <li>one</li>

+ // <li>two</li>

+ // <ul>

+ //

+ // If, however, there are blank lines between the items, each is wrapped in

+ // paragraphs:

+ //

+ // * one

+ //

+ // * two

+ //

+ // <ul>

+ // <li>one</li>

+ // <li>two</li>

+ // <ul>

+ //

+ // In other words, sometimes we parse the contents of a list item like a

+ // block, and sometimes line an inline. The rules our parser implements are:

+ //

+ // - If it has more than one line, it's a block.

+ // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,

+ // UL, OL) it's a block. (This is for cases like "* > quote".)

+ // - If there was a blank line between this item and the previous one, it's

+ // a block.

+ // - If there was a blank line between this item and the next one, it's a

+ // block.

+ // - Otherwise, parse it as an inline.

+ // Remove any trailing empty lines and note which items are separated by

+ // empty lines. Do this before seeing which items are single-line so that

+ // trailing empty lines on the last item don't force it into being a block.

+ for (int i = 0; i < items.length; i++) {

+ for (int j = items[i].lines.length - 1; j > 0; j--) {

+ if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {

+ // Found an empty line. Item and one after it are blocks.

+ if (i < items.length - 1) {

+ items[i].forceBlock = true;

+ items[i + 1].forceBlock = true;

+ }

+ items[i].lines.removeLast();

+ } else {

+ break;

+ }

+ // Convert the list items to Nodes.

+ final itemNodes = <Node>[];

+ for (final item in items) {

+ bool blockItem = item.forceBlock || (item.lines.length > 1);

+ // See if it matches some block parser.

+ final blocksInList = const [

+ _RE_BLOCKQUOTE,

+ _RE_HEADER,

+ _RE_HR,

+ _RE_INDENT,

+ _RE_UL,

+ _RE_OL

+ ];

+ if (!blockItem) {

+ for (final pattern in blocksInList) {

+ if (pattern.firstMatch(item.lines[0]) != null) {

+ blockItem = true;

+ break;

+ }

+ // Parse the item as a block or inline.

+ if (blockItem) {

+ // Block list item.

+ final children = parser.document.parseLines(item.lines);

+ itemNodes.add(new Element('li', children));

+ } else {

+ // Raw list item.

+ final contents = parser.document.parseInline(item.lines[0]);

+ itemNodes.add(new Element('li', contents));

+ }

+ return new Element(listTag, itemNodes);

+ }

+/// Parses unordered lists.

+class UnorderedListSyntax extends ListSyntax {

+ RegExp get pattern() => _RE_UL;

+ String get listTag() => 'ul';

+/// Parses ordered lists.

+class OrderedListSyntax extends ListSyntax {

+ RegExp get pattern() => _RE_OL;

+ String get listTag() => 'ol';

+/// Parses paragraphs of regular text.

+class ParagraphSyntax extends BlockSyntax {

+ bool get canEndBlock() => false;

+ bool canParse(BlockParser parser) => true;

+ Node parse(BlockParser parser) {

+ final childLines = [];

+ // Eat until we hit something that ends a paragraph.

+ while (!isAtBlockEnd(parser)) {

+ childLines.add(parser.current);

+ parser.advance();

+ }

+ final contents = parser.document.parseInline(

+ Strings.join(childLines, '\n'));

+ return new Element('p', contents);

+ }

« no previous file with comments | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | no next file with comments »