Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(95)

Side by Side Diff: utils/markdown/block_parser.dart

Issue 8680025: First pass at a markdown parser in Dart. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 9 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
/// Holder for the regular expressions the block parsers in this file use to
/// classify a single line of markdown. Every pattern is anchored at the start
/// of the line.
5 class _Re {
Jennifer Messerly 2011/11/23 22:25:41 what is this Java doing in my Dart code? :) Serio
Bob Nystrom 2011/11/29 02:56:29 Reorganized. I don't know what I was thinking. The
6 /// The line contains only whitespace (spaces or tabs) or is empty.
7 static final EMPTY = const RegExp(@'^([ \t]*)$');
8
9 /// A series of "=" or "-" (on the next line) define setext-style headers.
/// Group 1 is the whole underline; its first character tells "=" from "-".
10 static final SETEXT = const RegExp(@'^((=+)|(-+))$');
11
12 /// Leading (and trailing) "#" define atx-style headers. Group 1 is the run
/// of one to six leading "#"; group 2 is the header text with any trailing
/// "#" left out.
13 static final HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');
14
15 /// The line starts with ">" (after up to three leading spaces) with one
/// optional space after. Group 1 is the rest of the line.
16 static final BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');
17
18 /// A line indented four spaces. Used for code blocks and lists. Group 1 is
/// the line with the indent stripped.
/// NOTE(review): as displayed here the pattern accepts a single space or a
/// tab; the run of spaces may have been collapsed by the diff view. Confirm
/// against the checked-in file.
19 static final INDENT = const RegExp(@'^(?: |\t)(.*)$');
20
21 /// Three or more hyphens, asterisks or underscores by themselves. Note that
22 /// a line like "----" is valid as both HR and SETEXT. In case of a tie,
23 /// SETEXT should win.
/// Up to three leading spaces are allowed, and each marker run may be
/// followed by up to two spaces.
24 static final HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
25 @'(_+[ ]{0,2}){3,}|' +
26 @'(\*+[ ]{0,2}){3,})$');
27
28 /// Really hacky way to detect block-level embedded HTML. Just looks for
29 /// "<somename" at the very start of the line, followed by a space or ">".
30 static final HTML = const RegExp(@'^<[ ]*\w+[ >]');
31
32 /// A line starting with one of these markers: "-", "*", "+". May have up to
33 /// three leading spaces before the marker and requires at least one space or
34 /// tab after it. Group 1 is the item text.
35 static final UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');
36
37 /// A line starting with a number like "123.". May have up to three leading
38 /// spaces before the marker and requires at least one space or tab after it.
/// Group 1 is the item text.
39 static final OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
40
41 /// These patterns when appearing in a single-line list item will force the
42 /// item to be parsed as a block.
43 static final BLOCKS_IN_LIST = const [BLOCKQUOTE, HEADER, HR, INDENT, UL, OL];
44 }
45
/// Cursor over the lines of a markdown document. Holds the state the block
/// syntaxes need while they split the lines into blocks that are later handed
/// to the inline parser.
class BlockParser {
  /// The lines being parsed.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Index into [lines] of the line currently being looked at.
  int pos;

  BlockParser(this.lines, this.document)
    : pos = 0;

  /// The line at the cursor.
  String get current() {
    return lines[pos];
  }

  /// The line right after the cursor, or `null` when the cursor is already on
  /// (or past) the last line.
  String get next() {
    final following = pos + 1;
    // Don't read past the end.
    return (following < lines.length) ? lines[following] : null;
  }

  /// Moves the cursor forward by one line.
  void advance() {
    pos++;
  }

  /// Whether the cursor has moved past the last line.
  bool get isDone() {
    return pos >= lines.length;
  }

  /// Whether the current line matches [regex]. Always `false` once the parser
  /// is done.
  bool matches(RegExp regex) {
    return !isDone && regex.firstMatch(current) != null;
  }

  /// Whether the line after the current one matches [regex]. Always `false`
  /// when there is no following line.
  bool matchesNext(RegExp regex) {
    final upcoming = next;
    return upcoming != null && regex.firstMatch(upcoming) != null;
  }
}
85
/// Base class for the parsers that each recognize and parse one kind of
/// markdown block.
class BlockSyntax {
  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes() {
    // Lazy initialize.
    if (_syntaxes == null) {
      // Review note (Jennifer Messerly, 2011/11/23): could these be const, and
      // then use a "static final ... [remark truncated in the review view]
      // Reply (Bob Nystrom, 2011/11/29): They could be, but I'm thinking users
      // may be able ... [reply truncated in the review view]
      _syntaxes = [
        new EmptyBlockSyntax(),
        new BlockHtmlSyntax(),
        new SetextHeaderSyntax(),
        new HeaderSyntax(),
        new CodeBlockSyntax(),
        new BlockquoteSyntax(),
        new HorizontalRuleSyntax(),
        new UnorderedListSyntax(),
        new OrderedListSyntax(),
        new ParagraphSyntax()
      ];
    }

    return _syntaxes;
  }

  /// Backing store for [syntaxes]; stays `null` until first requested.
  static List<BlockSyntax> _syntaxes;

  /// Gets the regex used to identify the beginning of this block, if any.
  /// The base implementation has none and returns `null`.
  RegExp get pattern() => null;

  /// Whether a line that starts this kind of block also terminates the block
  /// before it (consulted by [isAtBlockEnd]).
  bool get canEndBlock() => true;

  /// Whether this syntax can parse the block starting at [parser]'s current
  /// line. The default tests the current line against [pattern], so a subclass
  /// that leaves [pattern] as `null` must override this method.
  bool canParse(BlockParser parser) {
    return pattern.firstMatch(parser.current) != null;
  }

  /// Parses the block starting at [parser]'s current line, advancing the
  /// parser past it, and returns the resulting node.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines that match [pattern] and returns, for each,
  /// the text captured by the pattern's first group (for example a blockquote
  /// line with its ">" stripped off). Stops at the first line that does not
  /// match or at the end of the input.
  List<String> parseChildLines(BlockParser parser) {
    // The captured groups are plain strings, not nodes.
    final childLines = <String>[];

    while (!parser.isDone) {
      final match = pattern.firstMatch(parser.current);
      if (match == null) break;
      childLines.add(match.group(1));
      parser.advance();
    }

    return childLines;
  }

  /// Gets whether or not [parser]'s current line should end the previous block.
  /// That is the case at the end of the input, or when some syntax that is
  /// allowed to end a block can parse the current line.
  static bool isAtBlockEnd(BlockParser parser) {
    if (parser.isDone) return true;
    return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);
  }
}
142
/// Swallows a blank line without producing any node.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.EMPTY;
  }

  Node parse(BlockParser parser) {
    // Step over the blank line; returning null means nothing is emitted.
    parser.advance();
    return null;
  }
}
153
/// Parses setext-style headers: a line of text followed by a line made up
/// solely of "=" (giving an h1) or "-" (giving an h2).
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks at the *next* line rather than the current one: the underline that
  /// follows the header text is what identifies this block.
  bool canParse(BlockParser parser) => parser.matchesNext(_Re.SETEXT);

  Node parse(BlockParser parser) {
    final underline = _Re.SETEXT.firstMatch(parser.next);

    // The first character of the underline decides the header level.
    final isTopLevel = underline.group(1)[0] == '=';
    final contents = parser.document.parseInline(parser.current);

    // Consume both the text line and its underline.
    parser.advance();
    parser.advance();

    return new Element(isTopLevel ? 'h1' : 'h2', contents);
  }
}
173
/// Parses atx-style headers such as "## Header ##". The number of leading "#"
/// characters selects the tag, "h1" through "h6".
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.HEADER;
  }

  Node parse(BlockParser parser) {
    final header = pattern.firstMatch(parser.current);
    parser.advance();

    final hashes = header.group(1);
    final text = header.group(2).trim();
    return new Element('h${hashes.length}', parser.document.parseInline(text));
  }
}
186
/// Parses email-style blockquotes, where each line begins with ">".
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.BLOCKQUOTE;
  }

  Node parse(BlockParser parser) {
    // Collect the quoted lines with their ">" markers removed, then parse
    // them recursively as a document of their own.
    final quoted = parseChildLines(parser);
    return new Element('blockquote', parser.document.parseLines(quoted));
  }
}
200
/// Parses indented, preformatted code blocks into `<pre><code>` elements.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.INDENT;
  }

  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // The Markdown tests expect a trailing newline.
    codeLines.add('');

    // The code is emitted literally, so it has to be HTML-escaped.
    final source = Strings.join(codeLines, '\n');
    final code = new Element.text('code', escapeHtml(source));

    return new Element('pre', [code]);
  }
}
217
/// Parses horizontal rules like "---", "_ _ _", "* * *", etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _Re.HR;

  /// Consumes the rule line and returns an empty `hr` element.
  Node parse(BlockParser parser) {
    // Nothing is captured from the rule line, so it is simply skipped; there
    // is no need to run the pattern against it again.
    parser.advance();
    return new Element.empty('hr');
  }
}
228
/// Parses HTML that appears at the block level. Compared with other markdown
/// implementations this is deliberately simplistic:
///
/// 1. It is way, way simpler.
/// 2. Every HTML tag at the block level is treated as a block. A paragraph
///    that starts with <em> is not wrapped in a <p>. Once something that looks
///    like HTML is seen, it is left untouched until the next block.
/// 3. No HTML parsing or validation is done at all. This is a markdown
///    parser, not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.HTML;
  }

  bool get canEndBlock() {
    return false;
  }

  Node parse(BlockParser parser) {
    final htmlLines = [];

    // Take lines verbatim until a blank line or the end of the input.
    for (; !(parser.isDone || parser.matches(_Re.EMPTY)); parser.advance()) {
      htmlLines.add(parser.current);
    }

    return new Text(Strings.join(htmlLines, '\n'));
  }
}
256
/// The raw lines of a single list item, plus a flag recording that the item
/// must be parsed as a block rather than as inline content.
class ListItem {
  /// The item's lines, with list markers and indentation already stripped.
  final List<String> lines;

  /// Set when the item has to be parsed as a block.
  bool forceBlock = false;

  ListItem(this.lines);
}
263
/// Base class for both ordered and unordered lists.
class ListSyntax extends BlockSyntax {
  /// The start of a list does not end the block before it.
  bool get canEndBlock() => false;

  /// The HTML tag of the list element this syntax produces.
  abstract String get listTag();

  /// Consumes all lines that belong to the list and returns a [listTag]
  /// element with one `li` child per list item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being accumulated, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Matches the current line against [pattern], remembering the match so
    // the branch that succeeded can read its groups.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_Re.EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_Re.UL) || tryMatch(_Re.OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match.group(1));
      } else if (tryMatch(_Re.INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match.group(1));
      } else if (isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    //     * one
    //     * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    //     <ul>
    //       <li>one</li>
    //       <li>two</li>
    //     </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    //     * one
    //
    //     * two
    //
    //     <ul>
    //       <li><p>one</p></li>
    //       <li><p>two</p></li>
    //     </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Walk backwards from the last line; the first line (index 0) is never
      // removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_Re.EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final pattern in _Re.BLOCKS_IN_LIST) {
          if (pattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
398
/// Parses unordered (bulleted) lists into `ul` elements.
class UnorderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _Re.UL;
  }

  String get listTag() {
    return 'ul';
  }
}
404
/// Parses ordered (numbered) lists into `ol` elements.
class OrderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _Re.OL;
  }

  String get listTag() {
    return 'ol';
  }
}
410
/// Parses paragraphs of regular text. It accepts any line, and the start of a
/// paragraph never ends the block before it.
class ParagraphSyntax extends BlockSyntax {
  bool get canEndBlock() {
    return false;
  }

  bool canParse(BlockParser parser) {
    return true;
  }

  Node parse(BlockParser parser) {
    final paragraphLines = [];

    // Keep taking lines until something that ends a paragraph shows up.
    for (; !isAtBlockEnd(parser); parser.advance()) {
      paragraphLines.add(parser.current);
    }

    final text = Strings.join(paragraphLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698