Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(21)

Side by Side Diff: utils/markdown/block_parser.dart

Issue 8953042: Move markdown library. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Add markdown tests to dartdoc. Created 9 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
/// The line contains only whitespace or is empty.
final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

/// A line made up solely of `=` or solely of `-`. When it appears on the line
/// *after* some text, that text is a setext-style header.
final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

/// Leading (and optional trailing) `#` define atx-style headers. Group 1 is
/// the run of one to six leading `#`; group 2 is the header text.
final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

/// The line starts with `>` (after up to three spaces) with one optional
/// space after. Group 1 is the quoted text.
final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

/// A line indented four spaces or one tab. Used for code blocks and lists.
/// Group 1 is the line with that indent removed.
///
/// The indent must be a full four spaces: a shorter run would turn any line
/// that merely starts with a space into a code block.
final _RE_INDENT = const RegExp(@'^(?:    |\t)(.*)$');

/// Three or more hyphens, asterisks or underscores by themselves. Note that
/// a line like `----` is valid as both HR and SETEXT. In case of a tie,
/// SETEXT should win.
final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
    @'(_+[ ]{0,2}){3,}|' +
    @'(\*+[ ]{0,2}){3,})$');

/// Really hacky way to detect block-level embedded HTML. Just looks for
/// "<somename".
final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

/// A line starting with one of these markers: `-`, `*`, `+`. May have up to
/// three leading spaces before the marker and any number of spaces or tabs
/// after. Group 1 is the item text.
final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

/// A line starting with a number like `123.`. May have up to three leading
/// spaces before the marker and any number of spaces or tabs after. Group 1
/// is the item text.
final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
39
/// Keeps a cursor over a list of lines while they are split into blocks of
/// markdown that can then be handed on for inline parsing.
class BlockParser {
  /// The lines this parser walks over.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Index into [lines] of the current line.
  int pos;

  /// Creates a parser positioned on the first of [lines].
  BlockParser(this.lines, this.document)
    : pos = 0;

  /// Gets the current line.
  String get current() {
    return lines[pos];
  }

  /// Gets the line that follows the current one, or `null` when there is no
  /// such line.
  String get next() {
    final following = pos + 1;
    // Never index past the end of [lines].
    return (following < lines.length) ? lines[following] : null;
  }

  /// Moves the cursor forward by one line.
  void advance() {
    pos++;
  }

  /// Whether the cursor has moved past the last line.
  bool get isDone() {
    return pos >= lines.length;
  }

  /// Gets whether or not the current line matches the given pattern. Always
  /// `false` once the parser is done.
  bool matches(RegExp regex) {
    return !isDone && (regex.firstMatch(current) != null);
  }

  /// Gets whether or not the *next* line matches the given pattern. Always
  /// `false` when there is no next line.
  bool matchesNext(RegExp regex) {
    final upcoming = next;
    return (upcoming != null) && (regex.firstMatch(upcoming) != null);
  }
}
82
/// Base class for the block-level syntaxes. Each subclass recognizes one kind
/// of block and turns the lines that form it into a [Node].
class BlockSyntax {
  /// Backing store for [syntaxes]; stays `null` until first requested.
  static List<BlockSyntax> _syntaxes;

  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes() {
    // Already built on an earlier call.
    if (_syntaxes != null) return _syntaxes;

    _syntaxes = [
        new EmptyBlockSyntax(),
        new BlockHtmlSyntax(),
        new SetextHeaderSyntax(),
        new HeaderSyntax(),
        new CodeBlockSyntax(),
        new BlockquoteSyntax(),
        new HorizontalRuleSyntax(),
        new UnorderedListSyntax(),
        new OrderedListSyntax(),
        new ParagraphSyntax()
      ];
    return _syntaxes;
  }

  /// Gets the regex used to identify the beginning of this block, if any.
  RegExp get pattern() {
    return null;
  }

  /// Whether a line this syntax can parse terminates the block before it.
  /// Consulted by [isAtBlockEnd].
  bool get canEndBlock() {
    return true;
  }

  /// Whether [parser]'s current line starts a block of this syntax. The
  /// default tests the current line against [pattern].
  bool canParse(BlockParser parser) {
    final hit = pattern.firstMatch(parser.current);
    return hit != null;
  }

  /// Consumes the lines of this block from [parser] and returns the resulting
  /// node.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines that match [pattern] and returns the first
  /// capture group of each one. Stops, without consuming it, at the first
  /// line that does not match or when [parser] runs out of lines.
  List<String> parseChildLines(BlockParser parser) {
    final stripped = <String>[];

    for (; !parser.isDone; parser.advance()) {
      final hit = pattern.firstMatch(parser.current);
      if (hit == null) break;
      stripped.add(hit[1]);
    }

    return stripped;
  }

  /// Gets whether or not [parser]'s current line should end the previous block.
  static bool isAtBlockEnd(BlockParser parser) {
    if (parser.isDone) return true;

    for (final syntax in syntaxes) {
      if (syntax.canParse(parser) && syntax.canEndBlock) return true;
    }
    return false;
  }
}
139
/// Consumes a line that is empty or whitespace-only without producing a node.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_EMPTY;
  }

  /// Steps past the current line and returns `null`.
  Node parse(BlockParser parser) {
    // Nothing is emitted for this line; it is only skipped.
    parser.advance();
    return null;
  }
}
150
/// Parses setext-style headers: a line of text underlined by a row of `=`
/// (giving `h1`) or `-` (giving `h2`).
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks at the line *after* the current one: the header is identified by
  /// its underline, not by its own text.
  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);

  Node parse(BlockParser parser) {
    final underline = _RE_SETEXT.firstMatch(parser.next);

    // The first character of the underline decides the header level.
    var tag;
    if (underline[1][0] == '=') {
      tag = 'h1';
    } else {
      tag = 'h2';
    }

    final contents = parser.document.parseInline(parser.current);

    // Consume the text line and then the underline.
    parser.advance();
    parser.advance();

    return new Element(tag, contents);
  }
}
170
/// Parses atx-style headers: `## Header ##`.
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_HEADER;
  }

  Node parse(BlockParser parser) {
    final header = pattern.firstMatch(parser.current);
    parser.advance();

    // The number of leading `#` is the header level.
    final hashes = header[1];
    final tag = 'h${hashes.length}';
    final text = header[2].trim();
    return new Element(tag, parser.document.parseInline(text));
  }
}
183
/// Parses email-style blockquotes: `> quote`.
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_BLOCKQUOTE;
  }

  Node parse(BlockParser parser) {
    // Collect the quoted lines with their `>` prefix removed, then parse them
    // recursively as a nested series of blocks.
    final quoted = parseChildLines(parser);
    final nested = parser.document.parseLines(quoted);
    return new Element('blockquote', nested);
  }
}
197
/// Parses preformatted code blocks that are indented four spaces.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_INDENT;
  }

  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // An extra empty entry yields the trailing newline that the Markdown
    // tests expect.
    codeLines.add('');

    // The code is HTML-escaped before it is wrapped.
    final source = Strings.join(codeLines, '\n');
    final code = new Element.text('code', escapeHtml(source));

    return new Element('pre', [code]);
  }
}
214
/// Parses horizontal rules like `---`, `_ _ _`, `* * *`, etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HR;

  /// Consumes the rule line and returns an empty `hr` element.
  Node parse(BlockParser parser) {
    // The text of the rule carries no information, so the line is simply
    // consumed; there is nothing to capture from it.
    parser.advance();
    return new Element.empty('hr');
  }
}
225
/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1.  This one is way way WAY simpler.
/// 2.  All HTML tags at the block level will be treated as blocks. If you
///     start a paragraph with `<em>`, it will not wrap it in a `<p>` for you.
///     As soon as it sees something like HTML, it stops mucking with it until
///     it hits the next block.
/// 3.  Absolutely no HTML parsing or validation is done. We're a markdown
///     parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_HTML;
  }

  bool get canEndBlock() {
    return false;
  }

  /// Gathers lines verbatim up to the next blank line (or the end of the
  /// input) and returns them as a single text node joined with newlines.
  Node parse(BlockParser parser) {
    final rawLines = [];

    // Stop at a blank line or when the lines run out.
    for (; !(parser.isDone || parser.matches(_RE_EMPTY)); parser.advance()) {
      rawLines.add(parser.current);
    }

    return new Text(Strings.join(rawLines, '\n'));
  }
}
253
/// The lines of a single list item, plus a flag recording whether the item
/// has to be parsed as a block.
class ListItem {
  /// The item's lines.
  final List<String> lines;

  /// Set when the item must be parsed as a block; starts out `false`.
  bool forceBlock = false;

  ListItem(this.lines);
}
260
/// Base class for both ordered and unordered lists.
class ListSyntax extends BlockSyntax {
  /// A line that starts a list does not terminate the block before it.
  bool get canEndBlock() => false;

  /// The tag of the element produced for the whole list.
  abstract String get listTag();

  /// Consumes the lines that make up the list, splits them into items and
  /// returns a [listTag] element holding one `li` element per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being collected, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Tests the current line against [pattern] and remembers the match so the
    // branch that succeeded can read its capture group.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_RE_EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match[1]);
      } else if (tryMatch(_RE_INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match[1]);
      } else if (isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    //     * one
    //     * two
    //
    // Then it will insert the contents of the lines directly in the <li>,
    // like:
    //
    //     <ul>
    //       <li>one</li>
    //       <li>two</li>
    //     </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    //     * one
    //
    //     * two
    //
    //     <ul>
    //       <li><p>one</p></li>
    //       <li><p>two</p></li>
    //     </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements
    // are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Walk backwards from the last line; index 0 is never removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Patterns that, when they match a single-line item, make it a block. The
    // list is the same for every item, so it is built once.
    final blocksInList = const [
        _RE_BLOCKQUOTE,
        _RE_HEADER,
        _RE_HR,
        _RE_INDENT,
        _RE_UL,
        _RE_OL
      ];

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final pattern in blocksInList) {
          if (pattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
404
/// Parses unordered lists, producing a `ul` element.
class UnorderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _RE_UL;
  }

  String get listTag() {
    return 'ul';
  }
}
410
/// Parses ordered lists, producing an `ol` element.
class OrderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _RE_OL;
  }

  String get listTag() {
    return 'ol';
  }
}
416
/// Parses paragraphs of regular text.
class ParagraphSyntax extends BlockSyntax {
  /// A paragraph line does not terminate the block before it.
  bool get canEndBlock() {
    return false;
  }

  /// Accepts every line.
  bool canParse(BlockParser parser) {
    return true;
  }

  Node parse(BlockParser parser) {
    final paragraphLines = [];

    // Keep taking lines until one of them ends the paragraph.
    for (; !isAtBlockEnd(parser); parser.advance()) {
      paragraphLines.add(parser.current);
    }

    final text = Strings.join(paragraphLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}
OLDNEW
« no previous file with comments | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698