utils/markdown/block_parser.dart - Issue 8680025: First pass at a markdown parser in Dart.

Side by Side Diff: utils/markdown/block_parser.dart

Issue 8680025: First pass at a markdown parser in Dart. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Review. Add missing file (oops!). Created 9 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	/// The line contains only whitespace or is empty.
	final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

	/// A series of "=" or "-" (on the next line) define setext-style headers.
	/// The whole line must consist of just one of the two characters.
	final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

	/// Leading (and trailing) "#" define atx-style headers. Group 1 holds the one
	/// to six leading "#" characters and group 2 holds the header text. Trailing
	/// "#" characters are optional and are not captured.
	final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

	/// The line starts with ">" with one optional space after. Up to three
	/// leading spaces are allowed. Group 1 is the text after the marker.
	final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

	/// A line indented four spaces (or one tab). Used for code blocks and lists.
	/// Group 1 is the line with that indentation stripped off.
	final _RE_INDENT = const RegExp(@'^(?:    |\t)(.*)$');

	/// Three or more hyphens, asterisks or underscores by themselves. Note that
	/// a line like "----" is valid as both HR and SETEXT. In case of a tie,
	/// SETEXT should win.
	final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
	    @'(_+[ ]{0,2}){3,}|' +
	    @'(\*+[ ]{0,2}){3,})$');

	/// Really hacky way to detect block-level embedded HTML. Just looks for
	/// "<somename".
	final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

	/// A line starting with one of these markers: "-", "*", "+". May have up to
	/// three leading spaces before the marker and any number of spaces or tabs
	/// after. Group 1 is the item text that follows the marker.
	final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

	/// A line starting with a number like "123.". May have up to three leading
	/// spaces before the marker and any number of spaces or tabs after. Group 1
	/// is the item text that follows the marker.
	final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');

	39

	/// Cursor over the lines of a markdown document. Holds the state needed to
	/// walk those lines and group them into blocks that are suitable for further
	/// inline parsing.
	class BlockParser {
	  /// The lines being parsed. [pos] is an index into this list.
	  final List<String> lines;

	  /// The markdown document this parser is parsing.
	  final Document document;

	  /// Index of the current line. Starts at the first line.
	  int pos = 0;

	  BlockParser(this.lines, this.document);

	  /// Gets the current line. Does no bounds checking, so callers should make
	  /// sure the parser is not [isDone] first.
	  String get current() => lines[pos];

	  /// Gets the line after the current one or `null` if there is none.
	  String get next() {
	    // Don't read past the end.
	    final following = pos + 1;
	    return (following < lines.length) ? lines[following] : null;
	  }

	  /// Moves on to the following line.
	  void advance() {
	    pos++;
	  }

	  /// Whether every line has been consumed.
	  bool get isDone() => lines.length <= pos;

	  /// Gets whether or not the current line matches the given pattern. Always
	  /// `false` once the parser is done.
	  bool matches(RegExp regex) {
	    return !isDone && (regex.firstMatch(current) != null);
	  }

	  /// Gets whether or not the line after the current one matches the given
	  /// pattern. `false` when there is no such line.
	  bool matchesNext(RegExp regex) {
	    final following = next;
	    if (following == null) return false;
	    return regex.firstMatch(following) != null;
	  }
	}

	79

	/// Base class for a syntax that can recognize and parse one kind of block.
	///
	/// A subclass either supplies a [pattern], which the default [canParse] and
	/// [parseChildLines] test against the current line, or overrides [canParse]
	/// itself.
	class BlockSyntax {
	  /// Gets the collection of built-in block parsers. To turn a series of lines
	  /// into blocks, each of these will be tried in turn. Order matters here.
	  static List<BlockSyntax> get syntaxes() {
	    // Lazy initialize.
	    if (_syntaxes == null) {
	      _syntaxes = [
	        new EmptyBlockSyntax(),
	        new BlockHtmlSyntax(),
	        new SetextHeaderSyntax(),
	        new HeaderSyntax(),
	        new CodeBlockSyntax(),
	        new BlockquoteSyntax(),
	        new HorizontalRuleSyntax(),
	        new UnorderedListSyntax(),
	        new OrderedListSyntax(),
	        new ParagraphSyntax()
	      ];
	    }

	    return _syntaxes;
	  }

	  /// Cache behind [syntaxes]; stays `null` until the first access.
	  static List<BlockSyntax> _syntaxes;

	  /// Gets the regex used to identify the beginning of this block, if any.
	  RegExp get pattern() => null;

	  /// Whether a line this syntax can parse ends the block before it. Consulted
	  /// by [isAtBlockEnd].
	  bool get canEndBlock() => true;

	  /// Gets whether or not this syntax can parse a block starting at [parser]'s
	  /// current line. The default tests [pattern], so a subclass that leaves
	  /// [pattern] as `null` has to override this.
	  bool canParse(BlockParser parser) {
	    return pattern.firstMatch(parser.current) != null;
	  }

	  /// Parses the block at [parser]'s current position into a node. The
	  /// implementations in this file advance [parser] past the lines they
	  /// consume, and may return `null` when nothing should be emitted.
	  abstract Node parse(BlockParser parser);

	  /// Consumes the run of consecutive lines that match [pattern] and returns
	  /// the first capture group of each, i.e. the line with its block marker
	  /// (such as the ">" of a blockquote) stripped off.
	  List<String> parseChildLines(BlockParser parser) {
	    final childLines = <String>[];

	    while (!parser.isDone) {
	      final match = pattern.firstMatch(parser.current);
	      if (match == null) break;
	      childLines.add(match.group(1));
	      parser.advance();
	    }

	    return childLines;
	  }

	  /// Gets whether or not [parser]'s current line should end the previous block.
	  /// That is the case at the end of input, or when some syntax that is allowed
	  /// to end a block can parse the current line.
	  static bool isAtBlockEnd(BlockParser parser) {
	    if (parser.isDone) return true;
	    return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);
	  }
	}

	136

	/// Consumes a blank line without producing any output for it.
	class EmptyBlockSyntax extends BlockSyntax {
	  RegExp get pattern() => _RE_EMPTY;

	  /// Skips the blank line. Returns `null` because there is nothing to emit.
	  Node parse(BlockParser parser) {
	    parser.advance();
	    return null;
	  }
	}

	147

	/// Parses setext-style headers: a line of text followed by a line of "=" (for
	/// an `h1`) or "-" (for an `h2`).
	class SetextHeaderSyntax extends BlockSyntax {
	  /// Looks at the line *after* the current one, since the underlining is what
	  /// identifies this kind of header.
	  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);

	  Node parse(BlockParser parser) {
	    final underline = _RE_SETEXT.firstMatch(parser.next);

	    // The first character of the underline picks the header level.
	    var tag = 'h2';
	    if (underline.group(1)[0] == '=') tag = 'h1';

	    final contents = parser.document.parseInline(parser.current);

	    // Consume both the text line and its underline.
	    parser.advance();
	    parser.advance();

	    return new Element(tag, contents);
	  }
	}

	167

	/// Parses atx-style headers: "## Header ##".
	class HeaderSyntax extends BlockSyntax {
	  RegExp get pattern() => _RE_HEADER;

	  /// Turns the current line into an `h1`..`h6` element. The number of leading
	  /// "#" characters gives the level.
	  Node parse(BlockParser parser) {
	    final header = pattern.firstMatch(parser.current);
	    parser.advance();

	    final hashes = header.group(1);
	    final level = hashes.length;
	    final text = header.group(2).trim();
	    return new Element('h$level', parser.document.parseInline(text));
	  }
	}

	180

	/// Parses email-style blockquotes: "> quote".
	class BlockquoteSyntax extends BlockSyntax {
	  RegExp get pattern() => _RE_BLOCKQUOTE;

	  /// Gathers the quoted lines with their ">" markers stripped and parses them
	  /// recursively as the contents of a `blockquote` element.
	  Node parse(BlockParser parser) {
	    final quoted = parseChildLines(parser);
	    return new Element('blockquote', parser.document.parseLines(quoted));
	  }
	}

	194

	/// Parses preformatted code blocks that are indented four spaces.
	class CodeBlockSyntax extends BlockSyntax {
	  RegExp get pattern() => _RE_INDENT;

	  /// Wraps the unindented, HTML-escaped lines in `<pre><code>`.
	  Node parse(BlockParser parser) {
	    final codeLines = parseChildLines(parser);

	    // The Markdown tests expect a trailing newline, which the empty last
	    // entry produces once the lines are joined.
	    codeLines.add('');

	    final code = new Element.text(
	        'code', escapeHtml(Strings.join(codeLines, '\n')));
	    return new Element('pre', [code]);
	  }
	}

	211

	/// Parses horizontal rules like "---", "_ _ _", "* * *", etc.
	class HorizontalRuleSyntax extends BlockSyntax {
	  RegExp get pattern() => _RE_HR;

	  /// Consumes the rule line and emits an empty `hr` element. Nothing on the
	  /// line itself is needed, so it is not matched again here.
	  Node parse(BlockParser parser) {
	    parser.advance();
	    return new Element.empty('hr');
	  }
	}

	222

	/// Parses inline HTML at the block level. This differs from other markdown
	/// implementations in several ways:
	///
	/// 1. This one is way way WAY simpler.
	/// 2. All HTML tags at the block level will be treated as blocks. If you start
	///    a paragraph with <em>, it will not wrap it in a <p> for you. As soon as
	///    it sees something like HTML, it stops mucking with it until it hits the
	///    next block.
	/// 3. Absolutely no HTML parsing or validation is done. We're a markdown
	///    parser not an HTML parser!
	class BlockHtmlSyntax extends BlockSyntax {
	  RegExp get pattern() => _RE_HTML;

	  /// A line that looks like HTML does not interrupt the block before it.
	  bool get canEndBlock() => false;

	  /// Passes every line up to the next blank line (or the end of input)
	  /// through untouched as a single text node.
	  Node parse(BlockParser parser) {
	    final htmlLines = [];

	    while (true) {
	      // Stop at the end of the input or at a blank line.
	      if (parser.isDone || parser.matches(_RE_EMPTY)) break;
	      htmlLines.add(parser.current);
	      parser.advance();
	    }

	    return new Text(Strings.join(htmlLines, '\n'));
	  }
	}

	250

	/// The raw lines of one list item, gathered by [ListSyntax] before it decides
	/// how to parse them.
	class ListItem {
	  /// The lines that make up the item.
	  final List<String> lines;

	  /// Set by [ListSyntax] when a blank line separates this item from a
	  /// neighboring one, which makes the item's contents parse as blocks.
	  bool forceBlock = false;

	  ListItem(this.lines);
	}

	257

	/// Base class for both ordered and unordered lists.
	///
	/// Parsing happens in three steps: gather the lines of each item, trim
	/// trailing blank lines while noting which items they separate, then turn
	/// each item into an `li` element parsed either as blocks or as inline text.
	class ListSyntax extends BlockSyntax {
	  /// The start of a list does not interrupt the block before it.
	  bool get canEndBlock() => false;

	  /// The tag name of the element that wraps the items.
	  abstract String get listTag();

	  Node parse(BlockParser parser) {
	    final items = <ListItem>[];
	    var childLines = <String>[];

	    // Closes the item being accumulated, if it has any lines.
	    endItem() {
	      if (childLines.length > 0) {
	        items.add(new ListItem(childLines));
	        childLines = <String>[];
	      }
	    }

	    // Tests the current line against a pattern and keeps the match around so
	    // that the branch that succeeded can read its capture group.
	    var match;
	    tryMatch(RegExp pattern) {
	      match = pattern.firstMatch(parser.current);
	      return match != null;
	    }

	    while (!parser.isDone) {
	      if (tryMatch(_RE_EMPTY)) {
	        // Add a blank line to the current list item.
	        childLines.add('');
	      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
	        // End the current list item and start a new one.
	        endItem();
	        childLines.add(match.group(1));
	      } else if (tryMatch(_RE_INDENT)) {
	        // Strip off indent and add to current item.
	        childLines.add(match.group(1));
	      } else if (isAtBlockEnd(parser)) {
	        // Done with the list.
	        break;
	      } else {
	        // Anything else is paragraph text or other stuff that can be in a list
	        // item. However, if the previous item is a blank line, this means we're
	        // done with the list and are starting a new top-level paragraph.
	        if ((childLines.length > 0) && (childLines.last() == '')) break;
	        childLines.add(parser.current);
	      }
	      parser.advance();
	    }

	    endItem();

	    // Markdown, because it hates us, specifies two kinds of list items. If you
	    // have a list like:
	    //
	    // * one
	    // * two
	    //
	    // Then it will insert the contents of the lines directly in the <li>, like:
	    // <ul>
	    //   <li>one</li>
	    //   <li>two</li>
	    // </ul>
	    //
	    // If, however, there are blank lines between the items, each is wrapped in
	    // paragraphs:
	    //
	    // * one
	    //
	    // * two
	    //
	    // <ul>
	    //   <li><p>one</p></li>
	    //   <li><p>two</p></li>
	    // </ul>
	    //
	    // In other words, sometimes we parse the contents of a list item like a
	    // block, and sometimes like an inline. The rules our parser implements are:
	    //
	    // - If it has more than one line, it's a block.
	    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
	    //   UL, OL) it's a block. (This is for cases like "* > quote".)
	    // - If there was a blank line between this item and the previous one, it's
	    //   a block.
	    // - If there was a blank line between this item and the next one, it's a
	    //   block.
	    // - Otherwise, parse it as an inline.

	    // Remove any trailing empty lines and note which items are separated by
	    // empty lines. Do this before seeing which items are single-line so that
	    // trailing empty lines on the last item don't force it into being a block.
	    for (int i = 0; i < items.length; i++) {
	      // Walk backwards from the last line, never touching the first one.
	      for (int j = items[i].lines.length - 1; j > 0; j--) {
	        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
	          // Found an empty line. Item and one after it are blocks.
	          if (i < items.length - 1) {
	            items[i].forceBlock = true;
	            items[i + 1].forceBlock = true;
	          }
	          items[i].lines.removeLast();
	        } else {
	          break;
	        }
	      }
	    }

	    // Patterns that make a single-line item a block when its line matches.
	    final blocksInList = const [
	      _RE_BLOCKQUOTE,
	      _RE_HEADER,
	      _RE_HR,
	      _RE_INDENT,
	      _RE_UL,
	      _RE_OL
	    ];

	    // Convert the list items to Nodes.
	    final itemNodes = <Node>[];
	    for (final item in items) {
	      bool blockItem = item.forceBlock || (item.lines.length > 1);

	      // See if it matches some block parser.
	      if (!blockItem) {
	        for (final pattern in blocksInList) {
	          if (pattern.firstMatch(item.lines[0]) != null) {
	            blockItem = true;
	            break;
	          }
	        }
	      }

	      // Parse the item as a block or inline.
	      if (blockItem) {
	        // Block list item.
	        final children = parser.document.parseLines(item.lines);
	        itemNodes.add(new Element('li', children));
	      } else {
	        // Raw list item.
	        final contents = parser.document.parseInline(item.lines[0]);
	        itemNodes.add(new Element('li', contents));
	      }
	    }

	    return new Element(listTag, itemNodes);
	  }
	}

	401

	/// Parses unordered lists into a `ul` element.
	class UnorderedListSyntax extends ListSyntax {
	  String get listTag() => 'ul';

	  /// A list of this kind begins at a line with a bullet marker.
	  RegExp get pattern() => _RE_UL;
	}

	407

	/// Parses ordered lists into an `ol` element.
	class OrderedListSyntax extends ListSyntax {
	  String get listTag() => 'ol';

	  /// A list of this kind begins at a line with a numbered marker.
	  RegExp get pattern() => _RE_OL;
	}

	413

	/// Parses paragraphs of regular text. This is the fallback syntax: it accepts
	/// any line.
	class ParagraphSyntax extends BlockSyntax {
	  /// Paragraph text never interrupts the block before it.
	  bool get canEndBlock() => false;

	  bool canParse(BlockParser parser) => true;

	  /// Joins the lines up to the next block boundary with newlines and wraps
	  /// the inline-parsed result in a `p` element.
	  Node parse(BlockParser parser) {
	    final paragraphLines = [];

	    // Eat until we hit something that ends a paragraph.
	    while (true) {
	      if (isAtBlockEnd(parser)) break;
	      paragraphLines.add(parser.current);
	      parser.advance();
	    }

	    final text = Strings.join(paragraphLines, '\n');
	    return new Element('p', parser.document.parseInline(text));
	  }
	}

OLD	NEW

« no previous file with comments | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | no next file with comments »