Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(175)

Side by Side Diff: utils/markdown/block_parser.dart

Issue 8680025: First pass at a markdown parser in Dart. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Review. Add missing file (oops!). Created 9 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
/// The line contains only whitespace or is empty. Group 1 is the whitespace.
final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

/// A series of "=" or "-" (on the next line) define setext-style headers.
/// Group 1 is the whole underline.
final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

/// Leading (and trailing) "#" define atx-style headers. Group 1 is the run of
/// one to six leading "#" and group 2 is the header text.
final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

/// The line starts with ">" (after up to three leading spaces) with one
/// optional space after. Group 1 is the quoted text.
final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

/// A line indented four spaces or one tab. Used for code blocks and lists.
/// Group 1 is the line with that indentation stripped.
///
/// The indent must be a full four spaces: the list patterns below allow up to
/// three leading spaces before a marker, so a shorter indent here would make
/// such list lines look like code.
final _RE_INDENT = const RegExp(@'^(?:    |\t)(.*)$');

/// Three or more hyphens, asterisks or underscores by themselves. Note that
/// a line like "----" is valid as both HR and SETEXT. In case of a tie,
/// SETEXT should win.
final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
                            @'(_+[ ]{0,2}){3,}|' +
                            @'(\*+[ ]{0,2}){3,})$');

/// Really hacky way to detect block-level embedded HTML. Just looks for
/// "<somename" followed by a space or ">".
final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

/// A line starting with one of these markers: "-", "*", "+". May have up to
/// three leading spaces before the marker and any number of spaces or tabs
/// after. Group 1 is the item text.
final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

/// A line starting with a number like "123.". May have up to three leading
/// spaces before the marker and any number of spaces or tabs after. Group 1 is
/// the item text.
final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
39
/// Holds the cursor state used while turning a series of lines into blocks of
/// markdown that are suitable for further inline parsing.
class BlockParser {
  /// The lines being parsed.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Index into [lines] of the line currently being examined.
  int pos;

  BlockParser(this.lines, this.document)
    : pos = 0;

  /// The line at [pos].
  String get current() {
    return lines[pos];
  }

  /// The line following the current one, or `null` when there is no such
  /// line.
  String get next() {
    final following = pos + 1;
    // Don't read past the end.
    return (following < lines.length) ? lines[following] : null;
  }

  /// Moves the cursor on to the following line.
  void advance() {
    pos++;
  }

  /// Whether the cursor has moved past the last line.
  bool get isDone() {
    return pos >= lines.length;
  }

  /// Whether the current line matches [regex]. Always false once [isDone].
  bool matches(RegExp regex) {
    return !isDone && (regex.firstMatch(current) != null);
  }

  /// Whether the line after the current one matches [regex]. False when there
  /// is no next line.
  bool matchesNext(RegExp regex) {
    final upcoming = next;
    return (upcoming != null) && (regex.firstMatch(upcoming) != null);
  }
}
79
/// Base class for the objects that each recognize and parse one kind of
/// block-level markdown construct.
class BlockSyntax {
  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes() {
    // Lazy initialize.
    if (_syntaxes == null) {
      _syntaxes = [
          new EmptyBlockSyntax(),
          new BlockHtmlSyntax(),
          new SetextHeaderSyntax(),
          new HeaderSyntax(),
          new CodeBlockSyntax(),
          new BlockquoteSyntax(),
          new HorizontalRuleSyntax(),
          new UnorderedListSyntax(),
          new OrderedListSyntax(),
          new ParagraphSyntax()
        ];
    }

    return _syntaxes;
  }

  /// Backing store for [syntaxes]; stays `null` until first requested.
  static List<BlockSyntax> _syntaxes;

  /// Gets the regex used to identify the beginning of this block, if any.
  /// The default is `null`, so a subclass that does not override this must
  /// override [canParse] and must not call [parseChildLines].
  RegExp get pattern() => null;

  /// Whether a line this syntax can parse terminates the block before it.
  /// Consulted by [isAtBlockEnd].
  bool get canEndBlock() => true;

  /// Whether this syntax can start a block at [parser]'s current line. The
  /// default tests the current line against [pattern]; the parser must not be
  /// done.
  bool canParse(BlockParser parser) {
    return pattern.firstMatch(parser.current) != null;
  }

  /// Parses the block starting at [parser]'s current line, advancing the
  /// parser past it.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines that match [pattern] and returns the text of
  /// capture group 1 of each one (e.g. a blockquote line with its ">"
  /// stripped). Stops at the first non-matching line or at the end of input.
  List<String> parseChildLines(BlockParser parser) {
    final childLines = <String>[];

    while (!parser.isDone) {
      final match = pattern.firstMatch(parser.current);
      if (match == null) break;
      childLines.add(match.group(1));
      parser.advance();
    }

    return childLines;
  }

  /// Gets whether or not [parser]'s current line should end the previous block.
  /// That is the case at the end of input, or when some syntax whose
  /// [canEndBlock] is true can parse the current line.
  static bool isAtBlockEnd(BlockParser parser) {
    if (parser.isDone) return true;
    return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);
  }
}
136
/// Swallows blank lines between blocks without producing any output.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_EMPTY;
  }

  /// Skips the blank line and returns `null`, since nothing is emitted for it.
  Node parse(BlockParser parser) {
    parser.advance();
    return null;
  }
}
147
/// Parses setext-style headers: a line of text underlined by a row of "=" or
/// "-" on the following line.
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks one line ahead: the current line is a header only when the *next*
  /// line is the underline.
  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);

  /// Builds an `h1` for an "=" underline and an `h2` for a "-" underline, then
  /// consumes both the text line and the underline.
  Node parse(BlockParser parser) {
    final underline = _RE_SETEXT.firstMatch(parser.next);

    String tag;
    if (underline.group(1)[0] == '=') {
      tag = 'h1';
    } else {
      tag = 'h2';
    }

    final headerNodes = parser.document.parseInline(parser.current);

    // Skip the header text and its underline.
    parser.advance();
    parser.advance();

    return new Element(tag, headerNodes);
  }
}
167
/// Parses atx-style headers: "## Header ##".
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_HEADER;
  }

  /// Turns the current line into an `h1`..`h6` element. The level is the
  /// number of leading "#" characters; the text is trimmed before inline
  /// parsing.
  Node parse(BlockParser parser) {
    final headerMatch = pattern.firstMatch(parser.current);
    parser.advance();

    final hashes = headerMatch.group(1);
    final text = headerMatch.group(2).trim();
    final level = hashes.length;

    return new Element('h$level', parser.document.parseInline(text));
  }
}
180
/// Parses email-style blockquotes: "> quote".
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_BLOCKQUOTE;
  }

  /// Collects the run of quoted lines with their ">" markers stripped and
  /// parses them recursively as a nested document.
  Node parse(BlockParser parser) {
    final quotedLines = parseChildLines(parser);
    final quotedBlocks = parser.document.parseLines(quotedLines);
    return new Element('blockquote', quotedBlocks);
  }
}
194
/// Parses preformatted code blocks made of indented lines.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_INDENT;
  }

  /// Gathers the indented lines with the indent removed, HTML-escapes them and
  /// wraps the result in `<pre><code>`.
  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // An extra empty entry makes the joined text end in a newline, which the
    // Markdown tests expect.
    codeLines.add('');

    final source = Strings.join(codeLines, '\n');
    final codeElement = new Element.text('code', escapeHtml(source));

    return new Element('pre', [codeElement]);
  }
}
211
/// Parses horizontal rules like "---", "_ _ _", "* * *", etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HR;

  /// Consumes the rule line and emits an empty `hr` element. Nothing on the
  /// line itself is needed, so it is not matched again here.
  Node parse(BlockParser parser) {
    parser.advance();
    return new Element.empty('hr');
  }
}
222
/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1.  This one is way way WAY simpler.
/// 2.  All HTML tags at the block level will be treated as blocks. If you start
///     a paragraph with <em>, it will not wrap it in a <p> for you. As soon as
///     it sees something like HTML, it stops mucking with it until it hits the
///     next block.
/// 3.  Absolutely no HTML parsing or validation is done. We're a markdown
///     parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_HTML;
  }

  /// An HTML-looking line does not terminate the block that precedes it.
  bool get canEndBlock() {
    return false;
  }

  /// Passes every line up to (but not including) the next blank line through
  /// untouched as a single text node.
  Node parse(BlockParser parser) {
    final htmlLines = [];

    while (true) {
      // Stop at the end of input or at the first blank line.
      if (parser.isDone) break;
      if (parser.matches(_RE_EMPTY)) break;

      htmlLines.add(parser.current);
      parser.advance();
    }

    final html = Strings.join(htmlLines, '\n');
    return new Text(html);
  }
}
250
/// The raw lines of one list item gathered by [ListSyntax], plus a flag
/// saying whether the item must be rendered as a block.
class ListItem {
  /// Set when a blank line separates this item from a neighbouring one.
  bool forceBlock;

  /// The item's lines with the list marker or indent already stripped.
  final List<String> lines;

  ListItem(this.lines)
    : forceBlock = false;
}
257
/// Base class for both ordered and unordered lists.
class ListSyntax extends BlockSyntax {
  /// A list-marker line does not terminate the block that precedes it.
  bool get canEndBlock() => false;

  /// The HTML tag to wrap the list items in.
  abstract String get listTag();

  /// Parses a run of list items starting at [parser]'s current line and
  /// returns a [listTag] element containing one `li` per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being accumulated, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Tests the current line against a pattern and remembers the match so the
    // branch that succeeded can read its capture group.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_RE_EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match.group(1));
      } else if (tryMatch(_RE_INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match.group(1));
      } else if (isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    // * one
    // * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    // <ul>
    //   <li>one</li>
    //   <li>two</li>
    // </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    // * one
    //
    // * two
    //
    // <ul>
    //   <li><p>one</p></li>
    //   <li><p>two</p></li>
    // </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Walk backwards from the last line; index 0 is never removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Patterns that make a single-line item a block. The list is the same for
    // every item, so it is built once rather than per iteration.
    final blocksInList = const [
        _RE_BLOCKQUOTE,
        _RE_HEADER,
        _RE_HR,
        _RE_INDENT,
        _RE_UL,
        _RE_OL
      ];

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final pattern in blocksInList) {
          if (pattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
401
/// Parses unordered lists, rendered as `ul`.
class UnorderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _RE_UL;
  }

  String get listTag() {
    return 'ul';
  }
}
407
/// Parses ordered lists, rendered as `ol`.
class OrderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _RE_OL;
  }

  String get listTag() {
    return 'ol';
  }
}
413
/// Parses paragraphs of regular text. This is the catch-all syntax: it accepts
/// any line.
class ParagraphSyntax extends BlockSyntax {
  /// Plain text does not terminate the block that precedes it.
  bool get canEndBlock() {
    return false;
  }

  bool canParse(BlockParser parser) {
    return true;
  }

  /// Gathers lines until one that ends the block (or the end of input), joins
  /// them with newlines and wraps the inline-parsed result in a `p` element.
  Node parse(BlockParser parser) {
    final paragraphLines = [];

    while (true) {
      // Stop at anything that ends a paragraph.
      if (isAtBlockEnd(parser)) break;

      paragraphLines.add(parser.current);
      parser.advance();
    }

    final text = Strings.join(paragraphLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}
OLDNEW
« no previous file with comments | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698