| Index: utils/markdown/block_parser.dart
|
| diff --git a/utils/markdown/block_parser.dart b/utils/markdown/block_parser.dart
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..d80c9e95c649aefe6ef928478a31fdfb2e4e2fbb
|
| --- /dev/null
|
| +++ b/utils/markdown/block_parser.dart
|
| @@ -0,0 +1,433 @@
|
| +// Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
|
| +// for details. All rights reserved. Use of this source code is governed by a
|
| +// BSD-style license that can be found in the LICENSE file.
|
| +
|
| +/// The line contains only whitespace or is empty.
|
| +final _RE_EMPTY = const RegExp(@'^([ \t]*)$');
|
| +
|
| +/// A series of "=" or "-" (on the next line) define setext-style headers.
|
| +final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');
|
| +
|
| +/// One to six leading "#" (and any trailing "#") define atx-style headers.
|
| +final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');
|
| +
|
| +/// Up to three leading spaces, then ">" with one optional space after.
|
| +final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');
|
| +
|
| +/// A line indented four spaces or one tab. Used for code blocks and lists.
|
| +final _RE_INDENT = const RegExp(@'^(?:[ ]{4}|\t)(.*)$');
|
| +
|
| +/// Three or more hyphens, asterisks or underscores by themselves. Note that
|
| +/// a line like "----" is valid as both HR and SETEXT. In case of a tie,
|
| +/// SETEXT should win.
|
| +final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
|
| +    @'(_+[ ]{0,2}){3,}|' +
|
| +    @'(\*+[ ]{0,2}){3,})$');
|
| +
|
| +/// Really hacky way to detect block-level embedded HTML. Just looks for
|
| +/// "<somename".
|
| +final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');
|
| +
|
| +/// A line starting with one of these markers: "-", "*", "+". May have up to
|
| +/// three leading spaces before the marker and any number of spaces or tabs
|
| +/// after.
|
| +final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');
|
| +
|
| +/// A line starting with a number like "123.". May have up to three leading
|
| +/// spaces before the marker and any number of spaces or tabs after.
|
| +final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
|
| +
|
| +/// Cursor over the lines of a markdown document. Block syntaxes use it to
|
| +/// inspect the current and following line and to consume lines as they parse.
|
| +class BlockParser {
|
| +  /// The lines being parsed, one entry per line of input.
|
| +  final List<String> lines;
|
| +
|
| +  /// The markdown document this parser is parsing.
|
| +  final Document document;
|
| +
|
| +  /// Zero-based index into [lines] of the line being examined.
|
| +  int pos;
|
| +
|
| +  BlockParser(this.lines, this.document) : pos = 0;
|
| +
|
| +  /// The line at [pos]. Does not guard against [pos] being past the end.
|
| +  String get current() => lines[pos];
|
| +
|
| +  /// The line following the current one, or `null` if there is none.
|
| +  String get next() {
|
| +    final following = pos + 1;
|
| +    return (following < lines.length) ? lines[following] : null;
|
| +  }
|
| +
|
| +  /// Moves on to the following line.
|
| +  void advance() => pos++;
|
| +
|
| +  /// Whether every line has been consumed.
|
| +  bool get isDone() => pos >= lines.length;
|
| +
|
| +  /// Whether the current line matches [regex]; `false` once [isDone].
|
| +  bool matches(RegExp regex) {
|
| +    return !isDone && regex.firstMatch(current) != null;
|
| +  }
|
| +
|
| +  /// Whether the next line matches [regex]; `false` if there is none.
|
| +  bool matchesNext(RegExp regex) {
|
| +    return next != null && regex.firstMatch(next) != null;
|
| +  }
|
| +}
|
| +
|
| +/// Base class for the block-level syntaxes tried against each line.
|
| +class BlockSyntax {
|
| +  /// Gets the built-in block parsers, created on first use. To turn lines
|
| +  /// into blocks, each of these is tried in turn, so order matters here.
|
| +  static List<BlockSyntax> get syntaxes() {
|
| +    if (_syntaxes == null) {
|
| +      _syntaxes = [
|
| +        new EmptyBlockSyntax(),
|
| +        new BlockHtmlSyntax(),
|
| +        new SetextHeaderSyntax(),
|
| +        new HeaderSyntax(),
|
| +        new CodeBlockSyntax(),
|
| +        new BlockquoteSyntax(),
|
| +        new HorizontalRuleSyntax(),
|
| +        new UnorderedListSyntax(),
|
| +        new OrderedListSyntax(),
|
| +        new ParagraphSyntax()
|
| +      ];
|
| +    }
|
| +    return _syntaxes;
|
| +  }
|
| +
|
| +  static List<BlockSyntax> _syntaxes;
|
| +
|
| +  /// Gets the regex used to identify the beginning of this block, if any.
|
| +  RegExp get pattern() => null;
|
| +
|
| +  /// Whether a line matching this syntax terminates the block before it.
|
| +  bool get canEndBlock() => true;
|
| +
|
| +  /// Whether [pattern] matches [parser]'s current line.
|
| +  bool canParse(BlockParser parser) {
|
| +    return pattern.firstMatch(parser.current) != null;
|
| +  }
|
| +
|
| +  /// Parses the block at [parser]'s current line and consumes its lines.
|
| +  abstract Node parse(BlockParser parser);
|
| +
|
| +  /// Consumes consecutive lines matching [pattern], returning group 1 of each.
|
| +  List<String> parseChildLines(BlockParser parser) {
|
| +    final childLines = <String>[];
|
| +    while (!parser.isDone) {
|
| +      final match = pattern.firstMatch(parser.current);
|
| +      if (match == null) break;
|
| +      childLines.add(match.group(1));
|
| +      parser.advance();
|
| +    }
|
| +    return childLines;
|
| +  }
|
| +
|
| +  /// Gets whether or not [parser]'s current line should end the previous block.
|
| +  static bool isAtBlockEnd(BlockParser parser) {
|
| +    if (parser.isDone) return true;
|
| +    return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);
|
| +  }
|
| +}
|
| +
|
| +/// Skips blank lines without producing any output.
|
| +class EmptyBlockSyntax extends BlockSyntax {
|
| +  RegExp get pattern() => _RE_EMPTY;
|
| +
|
| +  /// Consumes the blank line; the `null` result means no node is emitted.
|
| +  Node parse(BlockParser parser) {
|
| +    parser.advance();
|
| +    return null;
|
| +  }
|
| +}
|
| +
|
| +/// Parses setext-style headers: a line of text underlined by "=" or "-".
|
| +class SetextHeaderSyntax extends BlockSyntax {
|
| +  /// Tests the *next* line rather than the current one, because the
|
| +  /// underline sits on the line below the header text.
|
| +  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);
|
| +
|
| +  /// Builds an `h1` for "=" underlines and an `h2` for "-" underlines.
|
| +  Node parse(BlockParser parser) {
|
| +    final underline = _RE_SETEXT.firstMatch(parser.next).group(1);
|
| +    final isTopLevel = underline[0] == '=';
|
| +    final inlines = parser.document.parseInline(parser.current);
|
| +
|
| +    // Consume both the header text and its underline.
|
| +    parser.advance();
|
| +    parser.advance();
|
| +
|
| +    return new Element(isTopLevel ? 'h1' : 'h2', inlines);
|
| +  }
|
| +}
|
| +
|
| +/// Parses atx-style headers ("## Header ##") into `h1`..`h6` elements.
|
| +class HeaderSyntax extends BlockSyntax {
|
| +  RegExp get pattern() => _RE_HEADER;
|
| +
|
| +  Node parse(BlockParser parser) {
|
| +    final header = pattern.firstMatch(parser.current);
|
| +    parser.advance();
|
| +    final depth = header.group(1).length;
|
| +    final title = header.group(2).trim();
|
| +    return new Element('h$depth', parser.document.parseInline(title));
|
| +  }
|
| +}
|
| +
|
| +/// Parses email-style blockquotes, i.e. runs of lines that begin with ">".
|
| +class BlockquoteSyntax extends BlockSyntax {
|
| +  RegExp get pattern() => _RE_BLOCKQUOTE;
|
| +
|
| +  /// Strips the quote markers and parses what remains as nested lines.
|
| +  Node parse(BlockParser parser) {
|
| +    // Everything captured after the ">" marker on each quoted line.
|
| +    final quoted = parseChildLines(parser);
|
| +
|
| +    final nested = parser.document.parseLines(quoted);
|
| +    return new Element('blockquote', nested);
|
| +  }
|
| +}
|
| +
|
| +/// Parses preformatted code blocks: runs of lines matching [_RE_INDENT].
|
| +class CodeBlockSyntax extends BlockSyntax {
|
| +  RegExp get pattern() => _RE_INDENT;
|
| +
|
| +  /// Gathers the indented lines, escapes them and wraps them in `pre`/`code`.
|
| +  Node parse(BlockParser parser) {
|
| +    final codeLines = parseChildLines(parser);
|
| +    // An extra empty entry makes the joined text end with a newline, which
|
| +    // is what the Markdown tests expect.
|
| +    codeLines.add('');
|
| +    final source = Strings.join(codeLines, '\n');
|
| +
|
| +    final code = new Element.text('code', escapeHtml(source));
|
| +    return new Element('pre', [code]);
|
| +  }
|
| +}
|
| +
|
| +/// Parses horizontal rules like "---", "_ _ _", "* * *", etc.
|
| +class HorizontalRuleSyntax extends BlockSyntax {
|
| +  RegExp get pattern() => _RE_HR;
|
| +
|
| +  /// Consumes the rule line and emits an empty `hr` element.
|
| +  Node parse(BlockParser parser) {
|
| +    parser.advance();
|
| +    return new Element.empty('hr');
|
| +  }
|
| +}
|
| +
|
| +/// Parses raw HTML that appears at the block level. This is deliberately far
|
| +/// simpler than what other markdown implementations do:
|
| +///
|
| +/// 1. There is no attempt to match the sophistication of other parsers.
|
| +/// 2. Every HTML tag that starts a block is treated as a block. A paragraph
|
| +///    that begins with <em> is not wrapped in a <p>; once something that
|
| +///    looks like HTML is seen, lines are passed through untouched until the
|
| +///    blank line that ends the block.
|
| +/// 3. No HTML parsing or validation is performed. This is a markdown parser,
|
| +///    not an HTML parser.
|
| +class BlockHtmlSyntax extends BlockSyntax {
|
| +  bool get canEndBlock() => false;
|
| +
|
| +  RegExp get pattern() => _RE_HTML;
|
| +
|
| +  Node parse(BlockParser parser) {
|
| +    final htmlLines = [];
|
| +
|
| +    // The block extends up to, but not including, the next blank line.
|
| +    while (!(parser.isDone || parser.matches(_RE_EMPTY))) {
|
| +      htmlLines.add(parser.current);
|
| +      parser.advance();
|
| +    }
|
| +
|
| +    return new Text(Strings.join(htmlLines, '\n'));
|
| +  }
|
| +}
|
| +
|
| +/// One entry of a list being parsed: its raw [lines] plus a block-mode flag.
|
| +class ListItem {
|
| +  final List<String> lines;
|
| +  bool forceBlock = false;
|
| +  ListItem(this.lines);
|
| +}
|
| +
|
| +/// Base class for both ordered and unordered lists.
|
| +class ListSyntax extends BlockSyntax {
|
| +  bool get canEndBlock() => false;
|
| +
|
| +  /// The tag name of the emitted list element ('ul' or 'ol' in this file).
|
| +  abstract String get listTag();
|
| +
|
| +  /// Gathers the lines of every item, then parses each as a block or inline.
|
| +  Node parse(BlockParser parser) {
|
| +    final items = <ListItem>[];
|
| +    var childLines = <String>[];
|
| +
|
| +    endItem() {
|
| +      if (childLines.length > 0) {
|
| +        items.add(new ListItem(childLines));
|
| +        childLines = <String>[];
|
| +      }
|
| +    }
|
| +
|
| +    var match;
|
| +    tryMatch(RegExp pattern) {
|
| +      match = pattern.firstMatch(parser.current);
|
| +      return match != null;
|
| +    }
|
| +
|
| +    // Collect the raw lines of each item until something ends the list.
|
| +    while (!parser.isDone) {
|
| +      if (tryMatch(_RE_EMPTY)) {
|
| +        // Add a blank line to the current list item.
|
| +        childLines.add('');
|
| +      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
|
| +        // End the current list item and start a new one.
|
| +        endItem();
|
| +        childLines.add(match.group(1));
|
| +      } else if (tryMatch(_RE_INDENT)) {
|
| +        // Strip off indent and add to current item.
|
| +        childLines.add(match.group(1));
|
| +      } else if (isAtBlockEnd(parser)) {
|
| +        // Done with the list.
|
| +        break;
|
| +      } else {
|
| +        // Anything else is paragraph text or other stuff that can be in a list
|
| +        // item. However, if the previous item is a blank line, this means we're
|
| +        // done with the list and are starting a new top-level paragraph.
|
| +        if ((childLines.length > 0) && (childLines.last() == '')) break;
|
| +        childLines.add(parser.current);
|
| +      }
|
| +      parser.advance();
|
| +    }
|
| +
|
| +    endItem();
|
| +
|
| +    // Markdown, because it hates us, specifies two kinds of list items. If you
|
| +    // have a list like:
|
| +    //
|
| +    //     * one
|
| +    //     * two
|
| +    //
|
| +    // Then it will insert the contents of the lines directly in the <li>, like:
|
| +    //     <ul>
|
| +    //       <li>one</li>
|
| +    //       <li>two</li>
|
| +    //     </ul>
|
| +    //
|
| +    // If, however, there are blank lines between the items, each is wrapped in
|
| +    // paragraphs:
|
| +    //
|
| +    //     * one
|
| +    //
|
| +    //     * two
|
| +    //
|
| +    //     <ul>
|
| +    //       <li><p>one</p></li>
|
| +    //       <li><p>two</p></li>
|
| +    //     </ul>
|
| +    //
|
| +    // In other words, sometimes we parse the contents of a list item like a
|
| +    // block, and sometimes like an inline. The rules our parser implements are:
|
| +    //
|
| +    // - If it has more than one line, it's a block.
|
| +    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
|
| +    //   UL, OL) it's a block. (This is for cases like "* > quote".)
|
| +    // - If a blank line separates it from the previous item, it's a block.
|
| +    // - If a blank line separates it from the next item, it's a block.
|
| +    // - Otherwise, parse it as an inline.
|
| +
|
| +    // Remove any trailing empty lines and note which items are separated by
|
| +    // empty lines. Do this before seeing which items are single-line so that
|
| +    // trailing empty lines on the last item don't force it into being a block.
|
| +    for (int i = 0; i < items.length; i++) {
|
| +      for (int j = items[i].lines.length - 1; j > 0; j--) {
|
| +        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
|
| +          // Found an empty line. Item and one after it are blocks.
|
| +          if (i < items.length - 1) {
|
| +            items[i].forceBlock = true;
|
| +            items[i + 1].forceBlock = true;
|
| +          }
|
| +          items[i].lines.removeLast();
|
| +        } else {
|
| +          break;
|
| +        }
|
| +      }
|
| +    }
|
| +
|
| +    // Convert the list items to Nodes.
|
| +    final itemNodes = <Node>[];
|
| +    for (final item in items) {
|
| +      bool blockItem = item.forceBlock || (item.lines.length > 1);
|
| +
|
| +      // See if it matches some block parser.
|
| +      final blocksInList = const [
|
| +        _RE_BLOCKQUOTE,
|
| +        _RE_HEADER,
|
| +        _RE_HR,
|
| +        _RE_INDENT,
|
| +        _RE_UL,
|
| +        _RE_OL
|
| +      ];
|
| +
|
| +      if (!blockItem) {
|
| +        for (final pattern in blocksInList) {
|
| +          if (pattern.firstMatch(item.lines[0]) != null) {
|
| +            blockItem = true;
|
| +            break;
|
| +          }
|
| +        }
|
| +      }
|
| +
|
| +      // Parse the item as a block or inline.
|
| +      if (blockItem) {
|
| +        // Block list item.
|
| +        final children = parser.document.parseLines(item.lines);
|
| +        itemNodes.add(new Element('li', children));
|
| +      } else {
|
| +        // Raw list item.
|
| +        final contents = parser.document.parseInline(item.lines[0]);
|
| +        itemNodes.add(new Element('li', contents));
|
| +      }
|
| +    }
|
| +
|
| +    return new Element(listTag, itemNodes);
|
| +  }
|
| +}
|
| +
|
| +/// Parses bulleted lists, emitted as a `ul` element.
|
| +class UnorderedListSyntax extends ListSyntax {
|
| +  String get listTag() => 'ul';
|
| +  RegExp get pattern() => _RE_UL;
|
| +}
|
| +
|
| +/// Parses numbered lists, emitted as an `ol` element.
|
| +class OrderedListSyntax extends ListSyntax {
|
| +  String get listTag() => 'ol';
|
| +  RegExp get pattern() => _RE_OL;
|
| +}
|
| +
|
| +/// Parses paragraphs of regular text; accepts any line it is offered.
|
| +class ParagraphSyntax extends BlockSyntax {
|
| +  /// Any line can start a paragraph.
|
| +  bool canParse(BlockParser parser) => true;
|
| +
|
| +  /// A paragraph line never terminates the block before it.
|
| +  bool get canEndBlock() => false;
|
| +
|
| +  Node parse(BlockParser parser) {
|
| +    final textLines = [];
|
| +    // Keep consuming until the input runs out or a block-ending line appears.
|
| +    while (!isAtBlockEnd(parser)) {
|
| +      textLines.add(parser.current);
|
| +      parser.advance();
|
| +    }
|
| +
|
| +    final text = Strings.join(textLines, '\n');
|
| +    return new Element('p', parser.document.parseInline(text));
|
| +  }
|
| +}
|
|
|