utils/markdown/block_parser.dart - Issue 8680025: First pass at a markdown parser in Dart.

Side by Side Diff: utils/markdown/block_parser.dart

Issue 8680025: First pass at a markdown parser in Dart. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 9 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 class _Re {
	Jennifer Messerly 2011/11/23 22:25:41 what is this Java doing in my Dart code? :) Seriously though can these be top level? Actually... after reading further in the code, maybe these are just members of the classes that use them? If it's only used once it seems better to just inline it in the pattern() => getter. Bob Nystrom 2011/11/29 02:56:29 On 2011/11/23 22:25:41, John Messerly wrote: > what is this Java doing in my Dart code? :) > > Seriously though can these be top level? Actually... after reading further in > the code, maybe these are just members of the classes that use them? > > If it's only used once it seems better to just inline it in the pattern() => > getter. Reorganized. I don't know what I was thinking. They aren't directly in the classes because ListParser (sigh) uses almost all of them in addition to them being used by their appropriate parsers.
	6 /// The line contains only whitespace or is empty.

	7 static final EMPTY = const RegExp(@'^([ \t]*)$');

	8

	9 /// A series of "=" or "-" (on the next line) define setext-style headers.

	10 static final SETEXT = const RegExp(@'^((=+)|(-+))$');

	11

	12 /// Leading (and trailing) "#" define atx-style headers.

	13 static final HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

	14

	15 /// The line starts with ">" with one optional space after.

	16 static final BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

	17

	18 /// A line indented four spaces (or one tab). Used for code blocks and lists.

	19 static final INDENT = const RegExp(@'^(?:    |\t)(.*)$');

	20

	21 /// Three or more hyphens, asterisks or underscores by themselves. Note that

	22 /// a line like "----" is valid as both HR and SETEXT. In case of a tie,

	23 /// SETEXT should win.

	24 static final HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +

	25 @'(_+[ ]{0,2}){3,}|' +

	26 @'(\*+[ ]{0,2}){3,})$');

	27

	28 /// Really hacky way to detect block-level embedded HTML. Just looks for

	29 /// "<somename".

	30 static final HTML = const RegExp(@'^<[ ]*\w+[ >]');

	31

	32 /// A line starting with one of these markers: "-", "*", "+". May have up to

	33 /// three leading spaces before the marker and any number of spaces or tabs

	34 /// after.

	35 static final UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

	36

	37 /// A line starting with a number like "123.". May have up to three leading

	38 /// spaces before the marker and any number of spaces or tabs after.

	39 static final OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');

	40

	41 /// These patterns when appearing in a single-line list item will force the

	42 /// item to be parsed as a block.

	43 static final BLOCKS_IN_LIST = const [BLOCKQUOTE, HEADER, HR, INDENT, UL, OL];

	44 }

	45

	46 /// Maintains the internal state needed to parse a series of lines into blocks

	47 /// of markdown suitable for further inline parsing.

	48 class BlockParser {

	49 final List<String> lines;

	50

	51 /// The markdown document this parser is parsing.

	52 final Document document;

	53

	54 /// Index of the current line.

	55 int pos;

	56

	57 BlockParser(this.lines, this.document)

	58 : pos = 0;

	59

	60 /// Gets the current line.

	61 String get current() => lines[pos];

	62

	63 /// Gets the line after the current one or `null` if there is none.

	64 String get next() {

	65 // Don't read past the end.

	66 if (pos >= lines.length - 1) return null;

	67 return lines[pos + 1];

	68 }

	69

	70 void advance() => pos++;

	71 bool get isDone() => pos >= lines.length;

	72

	73 /// Gets whether or not the current line matches the given pattern.

	74 bool matches(RegExp regex) {

	75 if (isDone) return false;

	76 return regex.firstMatch(current) != null;

	77 }

	78

	79 /// Gets whether or not the next line matches the given pattern (false if there is no next line).

	80 bool matchesNext(RegExp regex) {

	81 if (next == null) return false;

	82 return regex.firstMatch(next) != null;

	83 }

	84 }

	85

	86 class BlockSyntax {

	87 /// Gets the collection of built-in block parsers. To turn a series of lines

	88 /// into blocks, each of these will be tried in turn. Order matters here.

	89 static List<BlockSyntax> get syntaxes() {

	90 // Lazy initialize.

	91 if (_syntaxes == null) {

	92 _syntaxes = [

	93 new EmptyBlockSyntax(),
	Jennifer Messerly 2011/11/23 22:25:41 could these be const, and then use a "static final" list? Bob Nystrom 2011/11/29 02:56:29 On 2011/11/23 22:25:41, John Messerly wrote: > could these be const, and then use a "static final" list? They could be, but I'm thinking users may be able to insert their own block parsers in this collection in order to extend the markdown syntax for their own needs. I figured it might be handy to have it mutable. (Also, it saves me the chore of having to write const constructors for every syntax class.)
	94 new BlockHtmlSyntax(),

	95 new SetextHeaderSyntax(),

	96 new HeaderSyntax(),

	97 new CodeBlockSyntax(),

	98 new BlockquoteSyntax(),

	99 new HorizontalRuleSyntax(),

	100 new UnorderedListSyntax(),

	101 new OrderedListSyntax(),

	102 new ParagraphSyntax()

	103 ];

	104 }

	105

	106 return _syntaxes;

	107 }

	108

	109 static List<BlockSyntax> _syntaxes;

	110

	111 /// Gets the regex used to identify the beginning of this block, if any.

	112 RegExp get pattern() => null;

	113

	114 bool get canEndBlock() => true;

	115

	116 bool canParse(BlockParser parser) {

	117 return pattern.firstMatch(parser.current) != null;

	118 }

	119

	120 abstract Node parse(BlockParser parser);

	121

	122 List<String> parseChildLines(BlockParser parser) {

	123 // Collect consecutive lines matching [pattern], keeping only group 1 of each match (the text after the block marker).

	124 final childLines = <String>[];

	125

	126 while (!parser.isDone) {

	127 final match = pattern.firstMatch(parser.current);

	128 if (match == null) break;

	129 childLines.add(match.group(1));

	130 parser.advance();

	131 }

	132

	133 return childLines;

	134 }

	135

	136 /// Gets whether or not [parser]'s current line should end the previous block.

	137 static bool isAtBlockEnd(BlockParser parser) {

	138 if (parser.isDone) return true;

	139 return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);

	140 }

	141 }

	142

	143 class EmptyBlockSyntax extends BlockSyntax {

	144 RegExp get pattern() => _Re.EMPTY;

	145

	146 Node parse(BlockParser parser) {

	147 parser.advance();

	148

	149 // Don't actually emit anything.

	150 return null;

	151 }

	152 }

	153

	154 /// Parses setext-style headers.

	155 class SetextHeaderSyntax extends BlockSyntax {

	156 bool canParse(BlockParser parser) {

	157 // Note: matches next line, not the current one. We're looking for the

	158 // underlining after this line.

	159 return parser.matchesNext(_Re.SETEXT);

	160 }

	161

	162 Node parse(BlockParser parser) {

	163 final match = _Re.SETEXT.firstMatch(parser.next);

	164

	165 final tag = (match.group(1)[0] == '=') ? 'h1' : 'h2';

	166 final contents = parser.document.parseInline(parser.current);

	167 parser.advance();

	168 parser.advance();

	169

	170 return new Element(tag, contents);

	171 }

	172 }

	173

	174 /// Parses atx-style headers: "## Header ##".

	175 class HeaderSyntax extends BlockSyntax {

	176 RegExp get pattern() => _Re.HEADER;

	177

	178 Node parse(BlockParser parser) {

	179 final match = pattern.firstMatch(parser.current);

	180 parser.advance();

	181 final level = match.group(1).length;

	182 final contents = parser.document.parseInline(match.group(2).trim());

	183 return new Element('h$level', contents);

	184 }

	185 }

	186

	187 /// Parses email-style blockquotes: "> quote".

	188 class BlockquoteSyntax extends BlockSyntax {

	189 RegExp get pattern() => _Re.BLOCKQUOTE;

	190

	191 Node parse(BlockParser parser) {

	192 final childLines = parseChildLines(parser);

	193

	194 // Recursively parse the contents of the blockquote.

	195 final children = parser.document.parseLines(childLines);

	196

	197 return new Element('blockquote', children);

	198 }

	199 }

	200

	201 /// Parses preformatted code blocks that are indented four spaces.

	202 class CodeBlockSyntax extends BlockSyntax {

	203 RegExp get pattern() => _Re.INDENT;

	204

	205 Node parse(BlockParser parser) {

	206 final childLines = parseChildLines(parser);

	207

	208 // The Markdown tests expect a trailing newline.

	209 childLines.add('');

	210

	211 // Escape the code.

	212 final escaped = escapeHtml(Strings.join(childLines, '\n'));

	213

	214 return new Element('pre', [new Element.text('code', escaped)]);

	215 }

	216 }

	217

	218 /// Parses horizontal rules like "---", "_ _ _", "* * *", etc.

	219 class HorizontalRuleSyntax extends BlockSyntax {

	220 RegExp get pattern() => _Re.HR;

	221

	222 Node parse(BlockParser parser) {

	223 final match = pattern.firstMatch(parser.current);

	224 parser.advance();

	225 return new Element.empty('hr');

	226 }

	227 }

	228

	229 /// Parses inline HTML at the block level. This differs from other markdown

	230 /// implementations in several ways:

	231 ///

	232 /// 1. This one is way way WAY simpler.

	233 /// 2. All HTML tags at the block level will be treated as blocks. If you start

	234 /// a paragraph with <em>, it will not wrap it in a <p> for you. As soon as

	235 /// it sees something like HTML, it stops mucking with it until it hits the

	236 /// next block.

	237 /// 3. Absolutely no HTML parsing or validation is done. We're a markdown

	238 /// parser not an HTML parser!

	239 class BlockHtmlSyntax extends BlockSyntax {

	240 RegExp get pattern() => _Re.HTML;

	241

	242 bool get canEndBlock() => false;

	243

	244 Node parse(BlockParser parser) {

	245 final childLines = [];

	246

	247 // Eat until we hit a blank line.

	248 while (!parser.isDone && !parser.matches(_Re.EMPTY)) {

	249 childLines.add(parser.current);

	250 parser.advance();

	251 }

	252

	253 return new Text(Strings.join(childLines, '\n'));

	254 }

	255 }

	256

	257 class ListItem {

	258 bool forceBlock = false;

	259 final List<String> lines;

	260

	261 ListItem(this.lines);

	262 }

	263

	264 /// Base class for both ordered and unordered lists.

	265 class ListSyntax extends BlockSyntax {

	266 bool get canEndBlock() => false;

	267

	268 abstract String get listTag();

	269

	270 Node parse(BlockParser parser) {

	271 final items = <ListItem>[];

	272 var childLines = <String>[];

	273

	274 endItem() {

	275 if (childLines.length > 0) {

	276 items.add(new ListItem(childLines));

	277 childLines = <String>[];

	278 }

	279 }

	280

	281 var match;

	282 tryMatch(RegExp pattern) {

	283 match = pattern.firstMatch(parser.current);

	284 return match != null;

	285 }

	286

	287 bool afterEmpty = false;

	288 while (!parser.isDone) {

	289 if (tryMatch(_Re.EMPTY)) {

	290 // Add a blank line to the current list item.

	291 childLines.add('');

	292 } else if (tryMatch(_Re.UL) || tryMatch(_Re.OL)) {

	293 // End the current list item and start a new one.

	294 endItem();

	295 childLines.add(match.group(1));

	296 } else if (tryMatch(_Re.INDENT)) {

	297 // Strip off indent and add to current item.

	298 childLines.add(match.group(1));

	299 } else if (isAtBlockEnd(parser)) {

	300 // Done with the list.

	301 break;

	302 } else {

	303 // Anything else is paragraph text or other stuff that can be in a list

	304 // item. However, if the previous item is a blank line, this means we're

	305 // done with the list and are starting a new top-level paragraph.

	306 if ((childLines.length > 0) && (childLines.last() == '')) break;

	307 childLines.add(parser.current);

	308 }

	309 parser.advance();

	310 }

	311

	312 endItem();

	313

	314 // Markdown, because it hates us, specifies two kinds of list items. If you

	315 // have a list like:

	316 //

	317 // * one

	318 // * two

	319 //

	320 // Then it will insert the contents of the lines directly in the <li>, like:

	321 // <ul>

	322 // <li>one</li>

	323 // <li>two</li>

	324 // </ul>

	325 //

	326 // If, however, there are blank lines between the items, each is wrapped in

	327 // paragraphs:

	328 //

	329 // * one

	330 //

	331 // * two

	332 //

	333 // <ul>

	334 // <li><p>one</p></li>

	335 // <li><p>two</p></li>

	336 // </ul>

	337 //

	338 // In other words, sometimes we parse the contents of a list item like a

	339 // block, and sometimes like an inline. The rules our parser implements are:

	340 //

	341 // - If it has more than one line, it's a block.

	342 // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,

	343 // UL, OL) it's a block. (This is for cases like "* > quote".)

	344 // - If there was a blank line between this item and the previous one, it's

	345 // a block.

	346 // - If there was a blank line between this item and the next one, it's a

	347 // block.

	348 // - Otherwise, parse it as an inline.

	349

	350 // Remove any trailing empty lines and note which items are separated by

	351 // empty lines. Do this before seeing which items are single-line so that

	352 // trailing empty lines on the last item don't force it into being a block.

	353 for (int i = 0; i < items.length; i++) {

	354 for (int j = items[i].lines.length - 1; j > 0; j--) {

	355 if (_Re.EMPTY.firstMatch(items[i].lines[j]) != null) {

	356 // Found an empty line. Item and one after it are blocks.

	357 if (i < items.length - 1) {

	358 items[i].forceBlock = true;

	359 items[i + 1].forceBlock = true;

	360 }

	361 items[i].lines.removeLast();

	362 } else {

	363 break;

	364 }

	365 }

	366 }

	367

	368 // Convert the list items to Nodes.

	369 final itemNodes = <Node>[];

	370 for (final item in items) {

	371 bool blockItem = item.forceBlock || (item.lines.length > 1);

	372

	373 // See if it matches some block parser.

	374 if (!blockItem) {

	375 for (final pattern in _Re.BLOCKS_IN_LIST) {

	376 if (pattern.firstMatch(item.lines[0]) != null) {

	377 blockItem = true;

	378 break;

	379 }

	380 }

	381 }

	382

	383 // Parse the item as a block or inline.

	384 if (blockItem) {

	385 // Block list item.

	386 final children = parser.document.parseLines(item.lines);

	387 itemNodes.add(new Element('li', children));

	388 } else {

	389 // Raw list item.

	390 final contents = parser.document.parseInline(item.lines[0]);

	391 itemNodes.add(new Element('li', contents));

	392 }

	393 }

	394

	395 return new Element(listTag, itemNodes);

	396 }

	397 }

	398

	399 /// Parses unordered lists.

	400 class UnorderedListSyntax extends ListSyntax {

	401 RegExp get pattern() => _Re.UL;

	402 String get listTag() => 'ul';

	403 }

	404

	405 /// Parses ordered lists.

	406 class OrderedListSyntax extends ListSyntax {

	407 RegExp get pattern() => _Re.OL;

	408 String get listTag() => 'ol';

	409 }

	410

	411 /// Parses paragraphs of regular text.

	412 class ParagraphSyntax extends BlockSyntax {

	413 bool get canEndBlock() => false;

	414

	415 bool canParse(BlockParser parser) => true;

	416

	417 Node parse(BlockParser parser) {

	418 final childLines = [];

	419

	420 // Eat until we hit something that ends a paragraph.

	421 while (!isAtBlockEnd(parser)) {

	422 childLines.add(parser.current);

	423 parser.advance();

	424 }

	425

	426 final contents = parser.document.parseInline(

	427 Strings.join(childLines, '\n'));

	428 return new Element('p', contents);

	429 }

	430 }

OLD	NEW

« utils/markdown/ast.dart ('K') | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | utils/markdown/inline_parser.dart » ('J')