utils/markdown/inline_parser.dart - Issue 8680025: First pass at a markdown parser in Dart.

Unified Diff: utils/markdown/inline_parser.dart

Issue 8680025: First pass at a markdown parser in Dart. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 9 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: utils/markdown/inline_parser.dart

diff --git a/utils/markdown/inline_parser.dart b/utils/markdown/inline_parser.dart

new file mode 100644

index 0000000000000000000000000000000000000000..8bf7693d81d8050abac5cec43a8cb8aecafc1010

--- /dev/null

+++ b/utils/markdown/inline_parser.dart

@@ -0,0 +1,349 @@

+// BSD-style license that can be found in the LICENSE file.

+/// Maintains the internal state needed to parse inline span elements in
+/// markdown.
+///
+/// The parser walks [source] one position at a time. At each position it first
+/// asks the open tags on its stack whether their end pattern matches, then
+/// tries each entry of [syntaxes] in order, and otherwise treats the character
+/// as plain text.
+class InlineParser {
+  /// The inline syntaxes to try, in priority order: [parse] stops at the first
+  /// one that matches.
+  ///
+  /// The list is built on first access and then reused.
+  static List<InlineSyntax> get syntaxes() {
+    // Lazy initialize.
+    if (_syntaxes == null) {
+      _syntaxes = <InlineSyntax>[
+        new AutolinkSyntax(),
Jennifer Messerly 2011/11/23 22:25:41 const ctors?
Bob Nystrom 2011/11/29 02:56:29 See similar comment on block parser. Here I also r
+        new LinkSyntax(),
+        // "*" surrounded by spaces is left alone.
+        new TextSyntax(@' \* '),
+        // "_" surrounded by spaces is left alone.
+        new TextSyntax(@' _ '),
+        // Leave already-encoded HTML entities alone. Ensures we don't turn
+        // "&amp;" into "&amp;amp;"
+        new TextSyntax(@'&[#a-zA-Z0-9]*;'),
+        // Encode "&".
+        new TextSyntax(@'&', sub: '&amp;'),
+        // Encode "<". (Why not encode ">" too? Gruber is toying with us.)
+        new TextSyntax(@'<', sub: '&lt;'),
+        // Parse "**strong**" tags.
+        new TagSyntax(@'\*\*', tag: 'strong'),
+        // Parse "__strong__" tags.
+        new TagSyntax(@'__', tag: 'strong'),
+        // Parse "*emphasis*" tags.
+        new TagSyntax(@'\*', tag: 'em'),
+        // Parse "_emphasis_" tags.
+        // TODO(rnystrom): Underscores in the middle of a word should not be
+        // parsed as emphasis like_in_this.
+        new TagSyntax(@'_', tag: 'em'),
+        // Parse inline code within double backticks: "``code``".
+        new CodeSyntax(@'``[ ]?(.*?)[ ]?``'),
+        // Parse inline code within backticks: "`code`".
+        new CodeSyntax(@'`([^`]*)`')
+      ];
+    }
+
+    return _syntaxes;
+  }
+
+  /// Backing store for [syntaxes]; `null` until the getter is first read.
+  static List<InlineSyntax> _syntaxes;
+
+  /// The string of markdown being parsed.
+  final String source;
+
+  /// The markdown document this parser is parsing.
+  final Document document;
+
+  /// The current read position.
+  int pos = 0;
+
+  /// Starting position of the last unconsumed text.
+  int start = 0;
+
+  /// The currently open tags, innermost last. While [parse] runs, index 0
+  /// holds a fake top tag that collects the results.
+  final List<TagState> _stack;
+
+  InlineParser(this.source, this.document)
+    : _stack = <TagState>[];
+
+  /// Parses [source] and returns the top-level nodes it produced.
+  ///
+  /// Tags that are still open when the end of [source] is reached are
+  /// discarded and their source text is emitted as plain text.
+  List<Node> parse() {
+    // Make a fake top tag to hold the results.
+    _stack.add(new TagState(0, null));
+
+    while (!isDone) {
+      bool matched = false;
+
+      // See if any of the current tags on the stack match. We don't allow tags
+      // of the same kind to nest, so this takes priority over other possible
+      // matches. Index 0 is the fake top tag, which has no syntax, so the
+      // loop stops before it.
+      for (int i = _stack.length - 1; i > 0; i--) {
+        if (_stack[i].tryMatch(this)) {
+          matched = true;
+          break;
+        }
+      }
+      if (matched) continue;
+
+      // See if the current text matches any defined markdown syntax.
+      for (final syntax in syntaxes) {
+        if (syntax.tryMatch(this)) {
+          matched = true;
+          break;
+        }
+      }
+      if (matched) continue;
+
+      // If we got here, it's just text.
+      advanceBy(1);
+    }
+
+    // Unwind any unmatched tags and get the results.
+    return _stack[0].close(this, null);
+  }
+
+  /// Flushes the pending plain text between [start] and [pos] into the
+  /// children of the tag on top of the stack, then moves [start] up to [pos].
+  ///
+  /// Does nothing when there is no pending text.
+  writeText() {
+    if (pos > start) {
+      final text = source.substring(start, pos);
+      final nodes = _stack.last().children;
+
+      // If the previous node is text too, just append.
+      if ((nodes.length > 0) && (nodes.last() is Text)) {
+        final newNode = new Text('${nodes.last().text}$text');
+        nodes[nodes.length - 1] = newNode;
+      } else {
+        nodes.add(new Text(text));
+      }
+
+      start = pos;
+    }
+  }
+
+  /// Removes the top tag from the stack and rewinds [start] to where that tag
+  /// began, so that its source text is emitted as plain text the next time
+  /// [writeText] runs.
+  discardUnmatchedTag() {
+    final unfinished = _stack.removeLast();
+    start = unfinished.startPos;
+  }
+
+  /// Appends [node] to the children of the tag on top of the stack.
+  addNode(Node node) {
+    _stack.last().children.add(node);
+  }
+
+  // TODO(rnystrom): Only need this because RegExp doesn't let you start
+  // searching from a given offset.
Jennifer Messerly 2011/11/23 22:25:41 yeah... that seriously needs to be fixed in RegExp
Bob Nystrom 2011/11/29 02:56:29 Yeah. There's a few things in RegExp that are anno
+  /// The remainder of [source], from [pos] to the end.
+  String get currentSource() => source.substring(pos, source.length);
+
+  /// Whether [pos] has reached the end of [source].
+  bool get isDone() => pos == source.length;
+
+  /// Moves [pos] forward by [length] without touching [start], so the skipped
+  /// characters stay pending as plain text.
+  void advanceBy(int length) => pos += length;
+
+  /// Moves [pos] forward by [length] and sets [start] to the new [pos], so the
+  /// skipped characters are not emitted as plain text.
+  void consume(int length) {
+    pos += length;
+    start = pos;
+  }
+}

+/// Represents one kind of markdown tag that can be parsed.
+class InlineSyntax {
+  /// The pattern that must match at the parser's current position.
+  final RegExp pattern;
+
+  InlineSyntax(String pattern)
+    : pattern = new RegExp(pattern, true);
+  // TODO(rnystrom): Should use named arg for RegExp multiLine.
+
+  /// Tries to match [pattern] at the current position of [parser].
+  ///
+  /// When the pattern matches exactly at the current position, flushes the
+  /// parser's pending text and calls [onMatch]; if that returns `true`, the
+  /// matched text is consumed. Returns `true` whenever the pattern matched at
+  /// the current position, regardless of what [onMatch] returned, and `false`
+  /// otherwise.
+  bool tryMatch(InlineParser parser) {
+    final startMatch = pattern.firstMatch(parser.currentSource);
+    // The search runs over the rest of the source, so a hit further along is
+    // not a match for the current position.
+    if ((startMatch != null) && (startMatch.start() == 0)) {
+      // Write any existing plain text up to this point.
+      parser.writeText();
+
+      if (onMatch(parser, startMatch)) {
+        parser.consume(startMatch.group(0).length);
+      }
+      return true;
+    }
+    return false;
+  }
+
+  /// Handles a match of [pattern] found by [tryMatch].
+  ///
+  /// Returns `true` if [tryMatch] should consume the matched text, or `false`
+  /// if the parser position should be left as the implementation set it.
+  ///
+  /// This is the hook [tryMatch] calls, so it must be named `onMatch`.
+  abstract bool onMatch(InlineParser parser, Match match);
+}

+/// Matches stuff that should just be passed through as straight text.
+class TextSyntax extends InlineSyntax {
+  /// Text to emit in place of the match, or `null` to keep the matched text
+  /// as it is.
+  String substitute;
+
+  TextSyntax(String pattern, [String sub])
+    : super(pattern),
+      substitute = sub;
+
+  /// Emits either the matched text unchanged or [substitute] in its place.
+  ///
+  /// Without a substitute, the parser is advanced past the match so the text
+  /// stays pending as plain text, and `false` is returned so the caller does
+  /// not consume it. With a substitute, a text node holding [substitute] is
+  /// added and `true` is returned.
+  bool onMatch(InlineParser parser, Match match) {
+    if (substitute == null) {
+      // Just use the original matched text.
+      parser.advanceBy(match.group(0).length);
+      return false;
+    }
+
+    // Insert the substitution.
+    parser.addNode(new Text(substitute));
+    return true;
+  }
+}

+/// Matches autolinks like <http://foo.com>.
+class AutolinkSyntax extends InlineSyntax {
+  AutolinkSyntax()
+    : super(@'<((http|https|ftp)://[^>]*)>');
+  // TODO(rnystrom): Make case insensitive.
+
+  /// Adds an `<a>` element whose text and `href` are both the matched URL.
+  ///
+  /// Always returns `true` so the whole `<...>` match is consumed.
+  bool onMatch(InlineParser parser, Match match) {
+    final url = match.group(1);
+    // Escape the URL for the attribute as well as for the link text.
+    // LinkSyntax stores its href pre-escaped, and a URL containing "&" would
+    // otherwise end up unescaped in the attribute.
+    final escapedUrl = escapeHtml(url);
+
+    final anchor = new Element.text('a', escapedUrl);
+    anchor.attributes['href'] = escapedUrl;
+    parser.addNode(anchor);
+
+    return true;
+  }
+}

+/// Matches syntax that has a pair of tags and becomes an element, like '*' for
+/// `<em>`. Allows nested tags.
+class TagSyntax extends InlineSyntax {
+  /// The pattern that closes the tag. Built from the opening pattern unless
+  /// an `end` pattern was passed to the constructor.
+  final RegExp endPattern;
+
+  /// The name of the element created when the tag is closed.
+  final String tag;
+
+  TagSyntax(String pattern, [String tag, String end = null])
+    : super(pattern),
+      endPattern = new RegExp((end != null) ? end : pattern, true),
+      tag = tag;
+  // TODO(rnystrom): Doing this.field doesn't seem to work with named args.
Jennifer Messerly 2011/11/23 22:25:41 what's the issue here? can you file to the issue t
Bob Nystrom 2011/11/29 02:56:29 I think this might be the same issue that Mattias
+  // TODO(rnystrom): Should use named arg for RegExp multiLine.
+
+  /// Opens the tag by pushing a new [TagState] that starts at the parser's
+  /// current position. Always returns `true`.
+  bool onMatch(InlineParser parser, Match match) {
+    parser._stack.add(new TagState(parser.pos, this));
+    return true;
+  }
+
+  /// Closes the tag by adding a [tag] element that holds the children
+  /// collected in [state]. Always returns `true`.
+  bool onMatchEnd(InlineParser parser, Match match, TagState state) {
+    parser.addNode(new Element(tag, state.children));
+    return true;
+  }
+}

+/// Matches inline links like [blah] [id] and [blah] (url).
+class LinkSyntax extends TagSyntax {
+  /// The regex for the end of a link needs to handle both reference style and
+  /// inline styles as well as optional titles for inline links. To make that
+  /// a bit more palatable, this breaks it into pieces.
+  ///
+  /// Capture groups in the result: 1 is the reference id, 2 is the inline
+  /// URL and 3 is the inline title.
+  static get linkPattern() {
Jennifer Messerly 2011/11/23 22:25:41 could this be a field? or does the string interp b
Bob Nystrom 2011/11/29 02:56:29 It was breaking constness when I tried that.
+    final bracket = @'\][ \n\t]?'; // "]" with optional space after.
+    final refLink = @'\[([^\]]*)\]'; // "[id]" reflink id.
+    final title = @'(?:[ ]*"([^"]+)"|)'; // Optional title in quotes.
+    // Not a raw string, because $title is interpolated; the doubled
+    // backslashes produce the regex escapes "\(" and "\)".
+    final inlineLink = '\\(([^ )]+)$title\\)'; // "(url "title")" inline link.
+    return '$bracket(?:$refLink|$inlineLink)';
+  }
+
+  LinkSyntax()
+    : super(@'\[', end: linkPattern);
+
+  /// Closes a link by adding an `<a>` element that holds the children
+  /// collected in [state].
+  ///
+  /// For an inline link the URL and title come from [match]. For a reference
+  /// link they are looked up in the document's `refLinks` by id; an empty id
+  /// is replaced by the link's own source text. Returns `false`, adding
+  /// nothing, when the reference is unknown, and `true` otherwise.
+  bool onMatchEnd(InlineParser parser, Match match, TagState state) {
+    var url;
+    var title;
+
+    // Group 2 only takes part in the match for inline links. Treat a group
+    // that did not take part (null) the same as an empty one.
+    final inlineUrl = match.group(2);
+    if ((inlineUrl != null) && (inlineUrl != '')) {
+      // Inline link like [foo](url).
+      url = inlineUrl;
+      title = match.group(3);
+
+      // For whatever reason, markdown allows angle-bracketed URLs here.
+      if (url.startsWith('<') && url.endsWith('>')) {
+        url = url.substring(1, url.length - 1);
+      }
+    } else {
+      // Reference link like [foo] [bar].
+      var id = match.group(1);
+      if (id == '') {
+        // The id is empty ("[]") so infer it from the contents.
+        id = parser.source.substring(state.startPos + 1, parser.pos);
+      }
+
+      // Look up the link.
+      final link = parser.document.refLinks[id];
+      // If it's an unknown link just emit plaintext.
+      if (link == null) return false;
+
+      url = link.url;
+      title = link.title;
+    }
+
+    final anchor = new Element('a', state.children);
+    anchor.attributes['href'] = escapeHtml(url);
+    if ((title != null) && (title != '')) {
+      anchor.attributes['title'] = escapeHtml(title);
+    }
+
+    parser.addNode(anchor);
+    return true;
+  }
+}

+/// Matches backtick-enclosed inline code blocks.
+class CodeSyntax extends InlineSyntax {
+  /// Creates a syntax from [pattern]. [onMatch] reads the code text from the
+  /// pattern's first capture group.
+  CodeSyntax(String pattern)
+    : super(pattern);
+
+  /// Adds a `code` element holding the HTML-escaped text of the first capture
+  /// group of [match]. Always returns `true`.
+  bool onMatch(InlineParser parser, Match match) {
+    parser.addNode(new Element.text('code', escapeHtml(match.group(1))));
+    return true;
+  }
+}

+/// Keeps track of a currently open tag while it is being parsed. The parser
+/// maintains a stack of these so it can handle nested tags.
+class TagState {
+  /// The point in the original source where this tag started.
+  int startPos;
+
+  /// The syntax that created this node. `null` for the fake top tag that
+  /// [InlineParser.parse] pushes to hold the results.
+  final TagSyntax syntax;
+
+  /// The nodes parsed so far inside this tag. Starts out as an empty list.
+  final List<Node> children;
+
+  TagState(this.startPos, this.syntax)
+    : children = <Node>[];
+
+  /// Attempts to close this tag by matching the current text against its end
+  /// pattern.
+  ///
+  /// Returns `true` if the end pattern matched at the parser's current
+  /// position, in which case [close] has been called, and `false` otherwise.
+  bool tryMatch(InlineParser parser) {
+    Match endMatch = syntax.endPattern.firstMatch(parser.currentSource);
+    if ((endMatch != null) && (endMatch.start() == 0)) {
+      // Close the tag.
+      close(parser, endMatch);
+      return true;
+    }
+
+    return false;
+  }
+
+  /// Pops this tag off the stack, completes it, and adds it to the output.
+  /// Will discard any unmatched tags that happen to be above it on the stack.
+  /// If this is the last node in the stack, returns its children.
+  ///
+  /// [endMatch] is the match of the end pattern. It is only read when other
+  /// tags remain on the stack, so [InlineParser.parse] passes `null` when it
+  /// closes the fake top tag. Returns `null` unless this was the last tag on
+  /// the stack.
+  List<Node> close(InlineParser parser, Match endMatch) {
+    // Found a match. If there is anything above this tag on the stack,
+    // discard it. For example, given '*a _b*...' when we reach the second
+    // '*', '_' will be on the top of the stack. It's mismatched, so we
+    // just treat it as text.
+    while (parser._stack.last() != this) parser.discardUnmatchedTag();
+
+    // Pop this off the stack.
+    parser.writeText();
+    parser._stack.removeLast();
+
+    // If the stack is empty now, this is the special "results" node.
+    if (parser._stack.length == 0) return children;
+
+    // We are still parsing, so add this to its parent's children.
+    if (syntax.onMatchEnd(parser, endMatch, this)) {
+      parser.consume(endMatch.group(0).length);
+    } else {
+      // Didn't close correctly so revert to text.
+      parser.start = startPos;
+      parser.advanceBy(endMatch.group(0).length);
+    }
+
+    return null;
+  }
+}

« utils/markdown/block_parser.dart ('K') | « utils/markdown/html_renderer.dart ('k') | utils/markdown/markdown.dart » ('j') | utils/markdown/markdown.dart » ('J')