OLD | NEW |
| (Empty) |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
/// The line contains only whitespace or is empty.
final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

/// A series of `=` or `-` (on the next line) define setext-style headers.
final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

/// Leading (and trailing) `#` define atx-style headers. Group 1 is the run of
/// one to six leading `#`, group 2 is the header text.
final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

/// The line starts with `>` (after up to three leading spaces) with one
/// optional space after. Group 1 is the quoted text.
final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

/// A line indented four spaces or one tab. Used for code blocks and lists.
/// Group 1 is the line with that indent stripped.
///
/// The space count is written as `[ ]{4}` rather than four literal spaces so
/// that the required width is explicit and cannot be lost to whitespace
/// normalization.
final _RE_INDENT = const RegExp(@'^(?:[ ]{4}|\t)(.*)$');

/// Three or more hyphens, asterisks or underscores by themselves. Note that
/// a line like `----` is valid as both HR and SETEXT. In case of a tie,
/// SETEXT should win.
final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
    @'(_+[ ]{0,2}){3,}|' +
    @'(\*+[ ]{0,2}){3,})$');

/// Really hacky way to detect block-level embedded HTML. Just looks for
/// "<somename".
final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

/// A line starting with one of these markers: `-`, `*`, `+`. May have up to
/// three leading spaces before the marker and any number of spaces or tabs
/// after. Group 1 is the item text.
final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

/// A line starting with a number like `123.`. May have up to three leading
/// spaces before the marker and any number of spaces or tabs after. Group 1
/// is the item text.
final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
39 | |
/// Holds the cursor state needed to walk a list of lines while they are being
/// grouped into markdown blocks for later inline parsing.
class BlockParser {
  /// The lines being parsed.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Zero-based index into [lines] of the current line.
  int pos = 0;

  BlockParser(this.lines, this.document);

  /// The line at [pos]. Callers must make sure the parser is not [isDone].
  String get current() => lines[pos];

  /// The line right after the current one, or `null` when there is no such
  /// line.
  String get next() => (pos + 1 < lines.length) ? lines[pos + 1] : null;

  /// Moves the cursor to the following line.
  void advance() {
    pos += 1;
  }

  /// Whether the cursor has moved past the last line.
  bool get isDone() => pos >= lines.length;

  /// Whether the current line matches [regex]. Always `false` once the parser
  /// [isDone].
  bool matches(RegExp regex) => !isDone && regex.firstMatch(current) != null;

  /// Whether the line after the current one matches [regex]. Always `false`
  /// when there is no next line.
  bool matchesNext(RegExp regex) {
    final upcoming = next;
    return upcoming != null && regex.firstMatch(upcoming) != null;
  }
}
82 | |
/// Base class for the block-level syntaxes. Each subclass recognizes one kind
/// of block at the parser's current position and turns it into a [Node].
class BlockSyntax {
  /// Backing store for [syntaxes]; stays `null` until first requested.
  static List<BlockSyntax> _syntaxes;

  /// The built-in block syntaxes, created on first access and reused after
  /// that. Callers try them in list order, so the order is significant.
  static List<BlockSyntax> get syntaxes() {
    if (_syntaxes != null) return _syntaxes;

    _syntaxes = [
        new EmptyBlockSyntax(),
        new BlockHtmlSyntax(),
        new SetextHeaderSyntax(),
        new HeaderSyntax(),
        new CodeBlockSyntax(),
        new BlockquoteSyntax(),
        new HorizontalRuleSyntax(),
        new UnorderedListSyntax(),
        new OrderedListSyntax(),
        new ParagraphSyntax()
      ];
    return _syntaxes;
  }

  /// The regex that identifies the first line of this kind of block. The base
  /// implementation has none and returns `null`.
  RegExp get pattern() => null;

  /// Whether a line that starts this kind of block terminates the block that
  /// precedes it (see [isAtBlockEnd]).
  bool get canEndBlock() => true;

  /// Whether this syntax applies at the parser's current line. By default
  /// that means [pattern] matches the line.
  bool canParse(BlockParser parser) =>
      pattern.firstMatch(parser.current) != null;

  /// Consumes the lines of the block from [parser] and returns its node.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines matching [pattern] and returns the first
  /// capture group of each one, i.e. the line with its block marker removed.
  /// Stops at the first line that does not match or at the end of input.
  List<String> parseChildLines(BlockParser parser) {
    final stripped = <String>[];

    var match;
    while (!parser.isDone &&
        (match = pattern.firstMatch(parser.current)) != null) {
      stripped.add(match[1]);
      parser.advance();
    }

    return stripped;
  }

  /// Whether [parser]'s current line should end the previous block: either
  /// the input is exhausted, or some syntax that is allowed to end a block
  /// can parse the line.
  static bool isAtBlockEnd(BlockParser parser) {
    if (parser.isDone) return true;

    for (final syntax in syntaxes) {
      if (syntax.canParse(parser) && syntax.canEndBlock) return true;
    }
    return false;
  }
}
139 | |
/// Recognizes blank lines (those matching `_RE_EMPTY`) and discards them.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_EMPTY;

  /// Skips over the blank line. Returns `null` because a blank line produces
  /// no node.
  Node parse(BlockParser parser) {
    parser.advance();
    return null;
  }
}
150 | |
/// Parses setext-style headers: a line of text underlined by a row of `=`
/// (level one) or `-` (level two).
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks at the line *after* the current one, since the underline is what
  /// identifies the header.
  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);

  /// Builds an `h1` or `h2` from the current line and consumes both the text
  /// line and its underline.
  Node parse(BlockParser parser) {
    final underline = _RE_SETEXT.firstMatch(parser.next)[1];

    String tag;
    if (underline[0] == '=') {
      tag = 'h1';
    } else {
      tag = 'h2';
    }

    final title = parser.document.parseInline(parser.current);

    // Step over the header text, then over the underline.
    parser.advance();
    parser.advance();

    return new Element(tag, title);
  }
}
170 | |
/// Parses atx-style headers: `## Header ##`.
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HEADER;

  /// Consumes the header line and returns an `h1`..`h6` element whose level
  /// is the number of leading `#` characters.
  Node parse(BlockParser parser) {
    final header = pattern.firstMatch(parser.current);
    final hashes = header[1];
    final text = header[2].trim();
    parser.advance();

    final inlines = parser.document.parseInline(text);
    return new Element('h${hashes.length}', inlines);
  }
}
183 | |
/// Parses email-style blockquotes: `> quote`.
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_BLOCKQUOTE;

  /// Collects the quoted lines with their `>` markers stripped, parses them
  /// as a nested sequence of blocks and wraps the result in a `blockquote`.
  Node parse(BlockParser parser) {
    final quoted = parseChildLines(parser);
    return new Element('blockquote', parser.document.parseLines(quoted));
  }
}
197 | |
/// Parses preformatted code blocks made of indented lines (those matching
/// `_RE_INDENT`).
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_INDENT;

  /// Consumes the indented lines and returns them, HTML-escaped, as a
  /// `<pre><code>` element.
  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // An extra empty entry makes the joined text end in a newline, which the
    // Markdown tests expect.
    codeLines.add('');

    final source = Strings.join(codeLines, '\n');
    final code = new Element.text('code', escapeHtml(source));
    return new Element('pre', [code]);
  }
}
214 | |
/// Parses horizontal rules like `---`, `_ _ _`, `* * *`, etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HR;

  /// Consumes the rule line and returns an empty `hr` element.
  ///
  /// Nothing captured from the line is needed, so the line is not matched
  /// against [pattern] again here.
  Node parse(BlockParser parser) {
    parser.advance();
    return new Element.empty('hr');
  }
}
225 | |
/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1. This one is way way WAY simpler.
/// 2. All HTML tags at the block level will be treated as blocks. If you
///    start a paragraph with `<em>`, it will not wrap it in a `<p>` for you.
///    As soon as it sees something like HTML, it stops mucking with it until
///    it hits the next block.
/// 3. Absolutely no HTML parsing or validation is done. We're a markdown
///    parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HTML;

  /// A line that looks like HTML does not terminate the preceding block.
  bool get canEndBlock() => false;

  /// Consumes lines up to (not including) the next blank line or the end of
  /// input and returns them verbatim, joined by newlines, as a text node.
  Node parse(BlockParser parser) {
    final htmlLines = [];

    while (!parser.isDone) {
      if (parser.matches(_RE_EMPTY)) break;
      htmlLines.add(parser.current);
      parser.advance();
    }

    return new Text(Strings.join(htmlLines, '\n'));
  }
}
253 | |
/// A single item collected while parsing a list.
class ListItem {
  /// The lines that make up the item's content.
  final List<String> lines;

  /// Set when the item has to be parsed as block content even if it is only
  /// one line long.
  bool forceBlock = false;

  ListItem(this.lines);
}
260 | |
/// Base class for both ordered and unordered lists.
///
/// Parsing happens in three passes: the raw lines are grouped into
/// [ListItem]s, trailing blank lines are trimmed (recording which items they
/// separated), and each item is then converted to an `li` element.
class ListSyntax extends BlockSyntax {
  /// A list marker does not terminate the preceding block.
  bool get canEndBlock() => false;

  /// The tag of the element wrapping the items (`ul` or `ol`).
  abstract String get listTag();

  /// Consumes all lines belonging to the list and returns a [listTag]
  /// element with one `li` child per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being accumulated, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Holds the result of the most recent successful tryMatch() call.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_RE_EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match[1]);
      } else if (tryMatch(_RE_INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match[1]);
      } else if (isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    //     * one
    //     * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    //
    //     <ul>
    //       <li>one</li>
    //       <li>two</li>
    //     </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    //     * one
    //
    //     * two
    //
    //     <ul>
    //       <li><p>one</p></li>
    //       <li><p>two</p></li>
    //     </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Stops at index 1, so an item's first line is never removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Patterns that, when they match a single-line item, mean the item holds
    // block content. The list is the same for every item, so it is built once
    // here rather than on each pass through the loop below.
    final blocksInList = const [
        _RE_BLOCKQUOTE,
        _RE_HEADER,
        _RE_HR,
        _RE_INDENT,
        _RE_UL,
        _RE_OL
      ];

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final pattern in blocksInList) {
          if (pattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
404 | |
/// Parses unordered lists: items introduced by a `-`, `*` or `+` marker.
class UnorderedListSyntax extends ListSyntax {
  /// Items are wrapped in a `ul` element.
  String get listTag() => 'ul';

  /// A list starts on a line matching the unordered-item pattern.
  RegExp get pattern() => _RE_UL;
}
410 | |
/// Parses ordered lists: items introduced by a number marker like `123.`.
class OrderedListSyntax extends ListSyntax {
  /// Items are wrapped in an `ol` element.
  String get listTag() => 'ol';

  /// A list starts on a line matching the ordered-item pattern.
  RegExp get pattern() => _RE_OL;
}
416 | |
/// Parses paragraphs of regular text. This is the catch-all syntax: it
/// accepts any line.
class ParagraphSyntax extends BlockSyntax {
  /// Paragraph text does not terminate the preceding block.
  bool get canEndBlock() => false;

  /// Always `true`; any line can be paragraph text.
  bool canParse(BlockParser parser) => true;

  /// Consumes lines until something ends the paragraph (see
  /// [BlockSyntax.isAtBlockEnd]) and returns them, joined by newlines and
  /// inline-parsed, as a `p` element.
  Node parse(BlockParser parser) {
    final paragraphLines = [];

    for (; !isAtBlockEnd(parser); parser.advance()) {
      paragraphLines.add(parser.current);
    }

    final text = Strings.join(paragraphLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}
OLD | NEW |