utils/markdown/inline_parser.dart - Issue 8680025: First pass at a markdown parser in Dart.

Side by Side Diff: utils/markdown/inline_parser.dart

Issue 8680025: First pass at a markdown parser in Dart. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 9 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	/// Maintains the internal state needed to parse inline span elements in
	/// markdown.
	///
	/// The parser walks [source] from left to right. At each position it first
	/// offers the remaining text to the tags that are currently open (innermost
	/// first) so they can close, then to each entry of [syntaxes] in order. If
	/// nothing matches, the character is left as pending plain text.
	class InlineParser {
	  /// The inline syntaxes to try, in priority order (earlier entries win).
	  ///
	  /// The list is created on first access and then reused.
	  static List<InlineSyntax> get syntaxes() {
	    // Lazy initialize.
	    if (_syntaxes == null) {
	      // Review note (Messerly / Nystrom, 2011/11): const constructors were
	      // suggested for these syntaxes. They are not used because some
	      // syntaxes appeared to build their pattern in the constructor using
	      // "+" on strings, which the VM did not accept as a const expression.
	      // See the similar comment on the block parser.
	      _syntaxes = <InlineSyntax>[
	        new AutolinkSyntax(),
	        new LinkSyntax(),
	        // "*" surrounded by spaces is left alone.
	        new TextSyntax(@' \* '),
	        // "_" surrounded by spaces is left alone.
	        new TextSyntax(@' _ '),
	        // Leave already-encoded HTML entities alone. Ensures we don't turn
	        // "&amp;" into "&amp;amp;"
	        new TextSyntax(@'&[#a-zA-Z0-9]*;'),
	        // Encode "&".
	        new TextSyntax(@'&', sub: '&amp;'),
	        // Encode "<". (Why not encode ">" too? Gruber is toying with us.)
	        new TextSyntax(@'<', sub: '&lt;'),
	        // Parse "**strong**" tags. Must come before the single "*" syntax
	        // below, otherwise "**" would be read as two emphasis markers.
	        new TagSyntax(@'\*\*', tag: 'strong'),
	        // Parse "__strong__" tags.
	        new TagSyntax(@'__', tag: 'strong'),
	        // Parse "*emphasis*" tags.
	        new TagSyntax(@'\*', tag: 'em'),
	        // Parse "_emphasis_" tags.
	        // TODO(rnystrom): Underscores in the middle of a word should not be
	        // parsed as emphasis like_in_this.
	        new TagSyntax(@'_', tag: 'em'),
	        // Parse inline code within double backticks: "``code``".
	        new CodeSyntax(@'``[ ]?(.*?)[ ]?``'),
	        // Parse inline code within backticks: "`code`".
	        new CodeSyntax(@'`([^`]*)`')
	      ];
	    }

	    return _syntaxes;
	  }

	  /// Backing storage for [syntaxes]; `null` until first requested.
	  static List<InlineSyntax> _syntaxes;

	  /// The string of markdown being parsed.
	  final String source;

	  /// The markdown document this parser is parsing.
	  final Document document;

	  /// The current read position.
	  int pos = 0;

	  /// Starting position of the last unconsumed text.
	  int start = 0;

	  /// The tags that are currently open, innermost last. While [parse] runs,
	  /// index 0 holds a placeholder tag that collects the top-level nodes.
	  final List<TagState> _stack;

	  InlineParser(this.source, this.document)
	    : _stack = <TagState>[];

	  /// Parses [source] and returns the resulting top-level nodes.
	  List<Node> parse() {
	    // Make a fake top tag to hold the results.
	    _stack.add(new TagState(0, null));

	    while (!isDone) {
	      bool matched = false;

	      // See if any of the current tags on the stack match. We don't allow
	      // tags of the same kind to nest, so this takes priority over other
	      // possible matches. Index 0 is the fake top tag and is never tried.
	      for (int i = _stack.length - 1; i > 0; i--) {
	        if (_stack[i].tryMatch(this)) {
	          matched = true;
	          break;
	        }
	      }
	      if (matched) continue;

	      // See if the current text matches any defined markdown syntax.
	      for (final syntax in syntaxes) {
	        if (syntax.tryMatch(this)) {
	          matched = true;
	          break;
	        }
	      }
	      if (matched) continue;

	      // If we got here, it's just text.
	      advanceBy(1);
	    }

	    // Unwind any unmatched tags and get the results.
	    return _stack[0].close(this, null);
	  }

	  /// Flushes the pending plain text between [start] and [pos] into the
	  /// innermost open tag, then marks it as written by moving [start] to [pos].
	  writeText() {
	    if (pos > start) {
	      final text = source.substring(start, pos);
	      final nodes = _stack.last().children;

	      // If the previous node is text too, just append.
	      if ((nodes.length > 0) && (nodes.last() is Text)) {
	        final newNode = new Text('${nodes.last().text}$text');
	        nodes[nodes.length - 1] = newNode;
	      } else {
	        nodes.add(new Text(text));
	      }

	      start = pos;
	    }
	  }

	  /// Removes the top tag from the stack and rewinds [start] to where that
	  /// tag began, so that its source is emitted as plain text by the next
	  /// [writeText].
	  discardUnmatchedTag() {
	    final unfinished = _stack.removeLast();
	    start = unfinished.startPos;
	  }

	  /// Appends [node] to the children of the innermost open tag.
	  addNode(Node node) {
	    _stack.last().children.add(node);
	  }

	  /// The part of [source] that has not been read yet.
	  // TODO(rnystrom): Only need this because RegExp doesn't let you start
	  // searching from a given offset. (Review, 2011/11: agreed that this needs
	  // to be fixed in RegExp itself.)
	  String get currentSource() => source.substring(pos, source.length);

	  /// Whether the read position has reached the end of [source].
	  bool get isDone() => pos == source.length;

	  /// Moves the read position forward, keeping the skipped characters as
	  /// pending plain text.
	  void advanceBy(int length) => pos += length;

	  /// Moves the read position forward and drops the skipped characters from
	  /// the pending plain text.
	  void consume(int length) {
	    pos += length;
	    start = pos;
	  }
	}

	138

	/// Represents one kind of markdown tag that can be parsed.
	///
	/// Subclasses implement [onMatch] to decide what a match produces.
	class InlineSyntax {
	  /// The pattern that has to match at the parser's current position.
	  final RegExp pattern;

	  InlineSyntax(String pattern)
	    : pattern = new RegExp(pattern, true);
	  // TODO(rnystrom): Should use named arg for RegExp multiLine.

	  /// Tries to match [pattern] at the current position of [parser].
	  ///
	  /// On a match, flushes the parser's pending plain text and calls
	  /// [onMatch]. If [onMatch] returns `true`, the matched text is consumed.
	  /// Returns `true` whenever the pattern matched, `false` otherwise.
	  bool tryMatch(InlineParser parser) {
	    final startMatch = pattern.firstMatch(parser.currentSource);
	    // Only a match that begins exactly at the current position counts.
	    if ((startMatch != null) && (startMatch.start() == 0)) {
	      // Write any existing plain text up to this point.
	      parser.writeText();

	      if (onMatch(parser, startMatch)) {
	        parser.consume(startMatch.group(0).length);
	      }
	      // A match is reported even when onMatch returns false; in that case
	      // the handler is expected to have moved the parser itself (as
	      // TextSyntax does).
	      return true;
	    }
	    return false;
	  }

	  /// Handles a successful match of [pattern].
	  ///
	  /// Returns `true` if [tryMatch] should consume the matched text.
	  abstract bool onMatch(InlineParser parser, Match match);
	}

	163

	/// Syntax for text that is passed through unchanged, or replaced by a fixed
	/// string.
	class TextSyntax extends InlineSyntax {
	  /// The text emitted in place of each match, or `null` to keep the matched
	  /// source text as it is.
	  String substitute;

	  TextSyntax(String pattern, [String sub])
	    : super(pattern),
	      substitute = sub;

	  /// Emits [substitute] when one is set and asks for the match to be
	  /// consumed. Otherwise steps over the match, leaving it as pending plain
	  /// text, and returns `false`.
	  bool onMatch(InlineParser parser, Match match) {
	    if (substitute != null) {
	      // Insert the substitution.
	      parser.addNode(new Text(substitute));
	      return true;
	    }

	    // Just use the original matched text.
	    final matchedText = match.group(0);
	    parser.advanceBy(matchedText.length);
	    return false;
	  }
	}

	183

	/// Matches autolinks like <http://foo.com>.
	class AutolinkSyntax extends InlineSyntax {
	  // Group 1 captures the whole URL between the angle brackets. The scheme
	  // alternatives are separated by a plain "|"; an escaped "\|" would match
	  // a literal pipe character instead.
	  AutolinkSyntax()
	    : super(@'<((http|https|ftp)://[^>]*)>');
	  // TODO(rnystrom): Make case insensitive.

	  /// Adds an `a` element whose text is the escaped URL and whose `href` is
	  /// the URL as written. Always returns `true`.
	  bool onMatch(InlineParser parser, Match match) {
	    final url = match.group(1);

	    final anchor = new Element.text('a', escapeHtml(url));
	    // NOTE(review): the href is stored unescaped here, while LinkSyntax
	    // stores escapeHtml(url). Confirm which form the renderer expects.
	    anchor.attributes['href'] = url;
	    parser.addNode(anchor);

	    return true;
	  }
	}

	200

	/// Matches syntax that has a pair of tags and becomes an element, like '*' for
	/// `<em>`. Allows nested tags.
	class TagSyntax extends InlineSyntax {
	  /// The pattern that closes the tag.
	  final RegExp endPattern;

	  /// The name of the element created when the tag is closed.
	  final String tag;

	  /// Creates a syntax opened by [pattern]. When [end] is omitted, the tag is
	  /// closed by the same pattern that opens it.
	  TagSyntax(String pattern, [String tag, String end = null])
	    : super(pattern),
	      endPattern = new RegExp((end == null) ? pattern : end, true),
	      tag = tag;
	  // TODO(rnystrom): Doing this.field doesn't seem to work with named args.
	  // Possibly the same issue Mattias was working on at the time of review;
	  // filed just in case as https://code.google.com/p/dart/issues/detail?id=588
	  // TODO(rnystrom): Should use named arg for RegExp multiLine.

	  /// Opens a new tag at the parser's current position. Always returns `true`.
	  bool onMatch(InlineParser parser, Match match) {
	    final opened = new TagState(parser.pos, this);
	    parser._stack.add(opened);
	    return true;
	  }

	  /// Wraps the children gathered in [state] in a [tag] element and adds it
	  /// to the parser's output. Always returns `true`.
	  bool onMatchEnd(InlineParser parser, Match match, TagState state) {
	    final element = new Element(tag, state.children);
	    parser.addNode(element);
	    return true;
	  }
	}

	224

	/// Matches inline links like [blah] [id] and [blah] (url).
	class LinkSyntax extends TagSyntax {
	  /// The regex for the end of a link needs to handle both reference style and
	  /// inline styles as well as optional titles for inline links. To make that
	  /// a bit more palatable, this breaks it into pieces.
	  ///
	  /// Capture groups: 1 is the reference id, 2 is the inline URL and 3 is the
	  /// inline title.
	  // Review note (2011/11): this is a getter rather than a field because
	  // the string interpolation broke const-ness when a field was tried.
	  static String get linkPattern() {
	    final bracket = @'\][ \n\t]?'; // "]" with optional space after.
	    final refLink = @'\[([^\]]*)\]'; // "[id]" reflink id.
	    final title = @'(?:[ ]*"([^"]+)"|)'; // Optional title in quotes.
	    // Not a raw string because it interpolates $title, so the backslashes
	    // that escape the literal parentheses are doubled.
	    final inlineLink = '\\(([^ )]+)$title\\)'; // "(url "title")" inline link.
	    return '$bracket(?:$refLink|$inlineLink)';
	  }

	  LinkSyntax()
	    : super(@'\[', end: linkPattern);

	  /// Builds the `a` element for a closed link.
	  ///
	  /// Returns `false`, so that the link is emitted as plain text, when a
	  /// reference link names an id that is not in the document's `refLinks`.
	  bool onMatchEnd(InlineParser parser, Match match, TagState state) {
	    var url;
	    var title;

	    // A group that did not take part in the match is treated as absent
	    // whether it is reported as null or as an empty string.
	    final inlineUrl = match.group(2);
	    if ((inlineUrl != null) && (inlineUrl != '')) {
	      // Inline link like [foo](url).
	      url = inlineUrl;
	      title = match.group(3);

	      // For whatever reason, markdown allows angle-bracketed URLs here.
	      if (url.startsWith('<') && url.endsWith('>')) {
	        url = url.substring(1, url.length - 1);
	      }
	    } else {
	      // Reference link like [foo] [bar].
	      var id = match.group(1);
	      if ((id == null) || (id == '')) {
	        // The id is empty ("[]") so infer it from the contents.
	        id = parser.source.substring(state.startPos + 1, parser.pos);
	      }

	      // Look up the link.
	      final link = parser.document.refLinks[id];
	      // If it's an unknown link just emit plaintext.
	      if (link == null) return false;

	      url = link.url;
	      title = link.title;
	    }

	    final anchor = new Element('a', state.children);
	    anchor.attributes['href'] = escapeHtml(url);
	    if ((title != null) && (title != '')) {
	      anchor.attributes['title'] = escapeHtml(title);
	    }

	    parser.addNode(anchor);
	    return true;
	  }
	}

	281

	/// Matches backtick-enclosed inline code blocks.
	class CodeSyntax extends InlineSyntax {
	  /// Creates a code syntax; group 1 of [pattern] must capture the code text.
	  CodeSyntax(String pattern)
	    : super(pattern);

	  /// Adds a `code` element holding the escaped contents of group 1. Always
	  /// returns `true`.
	  bool onMatch(InlineParser parser, Match match) {
	    final escapedCode = escapeHtml(match.group(1));
	    final codeElement = new Element.text('code', escapedCode);
	    parser.addNode(codeElement);
	    return true;
	  }
	}

	292

	/// Keeps track of a currently open tag while it is being parsed. The parser
	/// maintains a stack of these so it can handle nested tags.
	class TagState {
	  /// The point in the original source where this tag started.
	  int startPos;

	  /// The syntax that created this node.
	  final TagSyntax syntax;

	  /// The nodes gathered inside this tag so far. Starts out empty.
	  final List<Node> children;

	  TagState(this.startPos, this.syntax)
	    : children = <Node>[];

	  /// Attempts to close this tag by matching the current text against its end
	  /// pattern. Returns whether the tag was closed.
	  bool tryMatch(InlineParser parser) {
	    final Match endMatch = syntax.endPattern.firstMatch(parser.currentSource);

	    // The end pattern has to match exactly at the current position.
	    if ((endMatch == null) || (endMatch.start() != 0)) return false;

	    // Close the tag.
	    close(parser, endMatch);
	    return true;
	  }

	  /// Pops this tag off the stack, completes it, and adds it to the output.
	  /// Will discard any unmatched tags that happen to be above it on the stack.
	  /// If this is the last node in the stack, returns its children; otherwise
	  /// returns `null`.
	  List<Node> close(InlineParser parser, Match endMatch) {
	    // Found a match. If there is anything above this tag on the stack,
	    // discard it. For example, given '*a _b*...' when we reach the second
	    // '*', '_' will be on the top of the stack. It's mismatched, so we
	    // just treat it as text.
	    while (parser._stack.last() != this) {
	      parser.discardUnmatchedTag();
	    }

	    // Flush pending text into this tag, then pop it off the stack.
	    parser.writeText();
	    parser._stack.removeLast();

	    // If the stack is empty now, this is the special "results" node.
	    if (parser._stack.length == 0) return children;

	    // We are still parsing, so add this to its parent's children.
	    final closerLength = endMatch.group(0).length;
	    if (!syntax.onMatchEnd(parser, endMatch, this)) {
	      // Didn't close correctly so revert to text.
	      parser.start = startPos;
	      parser.advanceBy(closerLength);
	    } else {
	      parser.consume(closerLength);
	    }

	    return null;
	  }
	}

OLD	NEW

« utils/markdown/block_parser.dart ('K') | « utils/markdown/html_renderer.dart ('k') | utils/markdown/markdown.dart » ('j') | utils/markdown/markdown.dart » ('J')