OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
/// Regular expressions used to classify a single line of markdown at the
/// block level.
///
/// Every pattern is anchored with "^" and is meant to be tested against one
/// line at a time.
// NOTE(review): a reviewer flagged this static-constants holder as Java-style
// and the author replied that it was reorganized; the reply is truncated in
// the review view, so confirm against the later revision before relying on
// this layout.
class _Re {
  /// The line contains only whitespace or is empty.
  static final EMPTY = const RegExp(@'^([ \t]*)$');

  /// A series of "=" or "-" (on the next line) define setext-style headers.
  static final SETEXT = const RegExp(@'^((=+)|(-+))$');

  /// Leading (and trailing) "#" define atx-style headers. Group 1 is the run
  /// of one to six leading "#" and group 2 is the header text.
  static final HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

  /// The line starts with ">" (after up to three leading spaces) with one
  /// optional space after. Group 1 is the quoted text.
  static final BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

  /// A line indented four spaces or one tab. Used for code blocks and lists.
  /// Group 1 is the line with the indentation stripped.
  static final INDENT = const RegExp(@'^(?:    |\t)(.*)$');

  /// Three or more hyphens, asterisks or underscores by themselves. Note that
  /// a line like "----" is valid as both HR and SETEXT. In case of a tie,
  /// SETEXT should win.
  static final HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
                                 @'(_+[ ]{0,2}){3,}|' +
                                 @'(\*+[ ]{0,2}){3,})$');

  /// Really hacky way to detect block-level embedded HTML. Just looks for
  /// "<somename" followed by a space or ">".
  static final HTML = const RegExp(@'^<[ ]*\w+[ >]');

  /// A line starting with one of these markers: "-", "*", "+". May have up to
  /// three leading spaces before the marker and any number of spaces or tabs
  /// after. Group 1 is the item text.
  static final UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

  /// A line starting with a number like "123.". May have up to three leading
  /// spaces before the marker and any number of spaces or tabs after. Group 1
  /// is the item text.
  static final OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');

  /// These patterns when appearing in a single-line list item will force the
  /// item to be parsed as a block.
  static final BLOCKS_IN_LIST = const [BLOCKQUOTE, HEADER, HR, INDENT, UL, OL];
}
45 | |
/// Cursor over the lines of a markdown document. Block syntaxes use it to
/// inspect the current and following line and to consume lines as they turn
/// them into blocks.
class BlockParser {
  /// The lines being parsed.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Index of the current line within [lines].
  int pos;

  /// Creates a parser positioned on the first line.
  BlockParser(this.lines, this.document)
    : pos = 0;

  /// Gets the current line.
  String get current() {
    return lines[pos];
  }

  /// Gets the line after the current one or `null` if there is none.
  String get next() {
    final following = pos + 1;
    return (following < lines.length) ? lines[following] : null;
  }

  /// Moves on to the following line.
  void advance() {
    pos++;
  }

  /// Gets whether every line has been consumed.
  bool get isDone() {
    return pos >= lines.length;
  }

  /// Gets whether or not the current line matches the given pattern. Always
  /// `false` once the parser is done.
  bool matches(RegExp regex) {
    return !isDone && regex.firstMatch(current) != null;
  }

  /// Gets whether or not the line after the current one matches the given
  /// pattern. Always `false` when there is no following line.
  bool matchesNext(RegExp regex) {
    final upcoming = next;
    return upcoming != null && regex.firstMatch(upcoming) != null;
  }
}
85 | |
/// Base class for the block-level syntaxes. Each subclass recognizes one kind
/// of block (header, list, paragraph, ...) and turns the lines that form it
/// into a [Node].
class BlockSyntax {
  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes() {
    // Lazy initialize.
    // NOTE(review): a reviewer asked whether these could be const behind a
    // "static final"; the author's (truncated) reply says they could be, but
    // hints the list is left as-is with users in mind. Confirm the intent
    // before changing this.
    if (_syntaxes == null) {
      _syntaxes = [
          new EmptyBlockSyntax(),
          new BlockHtmlSyntax(),
          new SetextHeaderSyntax(),
          new HeaderSyntax(),
          new CodeBlockSyntax(),
          new BlockquoteSyntax(),
          new HorizontalRuleSyntax(),
          new UnorderedListSyntax(),
          new OrderedListSyntax(),
          new ParagraphSyntax()
        ];
    }

    return _syntaxes;
  }

  /// Backing store for [syntaxes]; `null` until first requested.
  static List<BlockSyntax> _syntaxes;

  /// Gets the regex used to identify the beginning of this block, if any.
  /// The base implementation returns `null`, so a subclass must override
  /// either this or every method that uses it ([canParse] and
  /// [parseChildLines]).
  RegExp get pattern() => null;

  /// Gets whether a line that starts this kind of block terminates the block
  /// before it. See [isAtBlockEnd].
  bool get canEndBlock() => true;

  /// Gets whether the parser's current line starts a block of this syntax.
  bool canParse(BlockParser parser) {
    return pattern.firstMatch(parser.current) != null;
  }

  /// Consumes the lines that make up this block and returns the resulting
  /// node.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines that match [pattern] and returns the first
  /// capture group of each, i.e. the lines with this syntax's prefix (such as
  /// a blockquote ">" or a code-block indent) stripped off.
  List<String> parseChildLines(BlockParser parser) {
    final childLines = <String>[];

    while (!parser.isDone) {
      final match = pattern.firstMatch(parser.current);
      // The first non-matching line ends the block; leave it for the caller.
      if (match == null) break;
      childLines.add(match.group(1));
      parser.advance();
    }

    return childLines;
  }

  /// Gets whether or not [parser]'s current line should end the previous block.
  /// That is the case at the end of input, or when some built-in syntax both
  /// matches the line and has [canEndBlock] set.
  static bool isAtBlockEnd(BlockParser parser) {
    if (parser.isDone) return true;
    return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);
  }
}
142 | |
/// Swallows blank lines between blocks without producing any output.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.EMPTY;
  }

  /// Skips the blank line. Returns `null` because nothing is emitted for it.
  Node parse(BlockParser parser) {
    parser.advance();
    return null;
  }
}
153 | |
/// Parses setext-style headers: a line of text underlined by a line of "="
/// (h1) or "-" (h2).
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks at the *next* line rather than the current one, since the underline
  /// that marks the header follows the header text.
  bool canParse(BlockParser parser) => parser.matchesNext(_Re.SETEXT);

  /// Consumes both the text line and its underline.
  Node parse(BlockParser parser) {
    final underline = _Re.SETEXT.firstMatch(parser.next);

    // The first character of the underline picks the header level.
    var tag;
    if (underline.group(1)[0] == '=') {
      tag = 'h1';
    } else {
      tag = 'h2';
    }

    final inlineNodes = parser.document.parseInline(parser.current);

    // Step over the text line, then the underline.
    parser.advance();
    parser.advance();

    return new Element(tag, inlineNodes);
  }
}
173 | |
/// Parses atx-style headers: "## Header ##".
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.HEADER;
  }

  /// Consumes the header line. The number of leading "#" gives the level and
  /// the trimmed remainder is parsed as inline content.
  Node parse(BlockParser parser) {
    final header = pattern.firstMatch(parser.current);
    parser.advance();

    final hashes = header.group(1);
    final title = header.group(2).trim();
    final inlineNodes = parser.document.parseInline(title);
    return new Element('h${hashes.length}', inlineNodes);
  }
}
186 | |
/// Parses email-style blockquotes: "> quote".
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.BLOCKQUOTE;
  }

  /// Strips the ">" prefix from each quoted line, then parses what is left as
  /// a nested run of blocks.
  Node parse(BlockParser parser) {
    final quotedLines = parseChildLines(parser);
    final quotedBlocks = parser.document.parseLines(quotedLines);
    return new Element('blockquote', quotedBlocks);
  }
}
200 | |
/// Parses preformatted code blocks that are indented four spaces.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.INDENT;
  }

  /// Strips the indent from each line and wraps the HTML-escaped result in
  /// `<pre><code>`.
  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // An extra empty entry makes the joined text end with a newline, which
    // the Markdown tests expect.
    codeLines.add('');

    final source = Strings.join(codeLines, '\n');
    final code = new Element.text('code', escapeHtml(source));
    return new Element('pre', [code]);
  }
}
217 | |
/// Parses horizontal rules like "---", "_ _ _", "*  *  *", etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _Re.HR;

  /// Consumes the rule line and emits an empty `<hr>` element. The content of
  /// the line is irrelevant once [canParse] has accepted it, so it is not
  /// matched again here.
  Node parse(BlockParser parser) {
    parser.advance();
    return new Element.empty('hr');
  }
}
228 | |
/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1.  This one is way way WAY simpler.
/// 2.  All HTML tags at the block level will be treated as blocks. If you
///     start a paragraph with <em>, it will not wrap it in a <p> for you. As
///     soon as it sees something like HTML, it stops mucking with it until it
///     hits the next block.
/// 3.  Absolutely no HTML parsing or validation is done. We're a markdown
///     parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _Re.HTML;
  }

  /// A line that looks like HTML does not terminate the block before it.
  bool get canEndBlock() {
    return false;
  }

  /// Passes every line through untouched, up to the next blank line or the
  /// end of input, as a single text node.
  Node parse(BlockParser parser) {
    final htmlLines = [];

    for (; !(parser.isDone || parser.matches(_Re.EMPTY)); parser.advance()) {
      htmlLines.add(parser.current);
    }

    final html = Strings.join(htmlLines, '\n');
    return new Text(html);
  }
}
256 | |
/// The raw lines of one list item, collected before deciding whether the item
/// is rendered as block or inline content.
class ListItem {
  /// The item's lines with the list marker or indent already stripped.
  final List<String> lines;

  /// Set when the item must be parsed as a block regardless of its length.
  bool forceBlock = false;

  ListItem(this.lines);
}
263 | |
/// Base class for both ordered and unordered lists.
///
/// Subclasses provide [pattern] (the marker that starts the list) and
/// [listTag] (the tag the items are wrapped in). Parsing itself is shared: a
/// list may mix "*"-style and "1."-style markers, and every marker starts a
/// new item.
class ListSyntax extends BlockSyntax {
  /// A line that looks like a list item does not terminate the block before
  /// it.
  bool get canEndBlock() => false;

  /// Gets the tag name of the element that wraps the list items.
  abstract String get listTag();

  /// Consumes all lines belonging to the list and returns a [listTag] element
  /// holding one `li` per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being accumulated, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Tests the current line and keeps the match around so the branch that
    // succeeded can read its capture group.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_Re.EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_Re.UL) || tryMatch(_Re.OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match.group(1));
      } else if (tryMatch(_Re.INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match.group(1));
      } else if (isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    // * one
    // * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    // <ul>
    //   <li>one</li>
    //   <li>two</li>
    // </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    // * one
    //
    // * two
    //
    // <ul>
    //   <li><p>one</p></li>
    //   <li><p>two</p></li>
    // </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Stops before index 0: an item's first line is never removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_Re.EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final pattern in _Re.BLOCKS_IN_LIST) {
          if (pattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
398 | |
/// Parses unordered lists, i.e. those whose first item starts with "-", "*"
/// or "+". Produces a `ul` element.
class UnorderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _Re.UL;
  }

  String get listTag() {
    return 'ul';
  }
}
404 | |
/// Parses ordered lists, i.e. those whose first item starts with a number
/// like "123.". Produces an `ol` element.
class OrderedListSyntax extends ListSyntax {
  RegExp get pattern() {
    return _Re.OL;
  }

  String get listTag() {
    return 'ol';
  }
}
410 | |
/// Parses paragraphs of regular text. This is the catch-all syntax: it accepts
/// any line.
class ParagraphSyntax extends BlockSyntax {
  /// Ordinary text does not terminate the block before it.
  bool get canEndBlock() {
    return false;
  }

  /// Always `true`; any line can be paragraph text.
  bool canParse(BlockParser parser) {
    return true;
  }

  /// Gathers lines until something that ends a paragraph, joins them with
  /// newlines and parses the result as inline content of a `p` element.
  Node parse(BlockParser parser) {
    final paragraphLines = [];

    for (; !isAtBlockEnd(parser); parser.advance()) {
      paragraphLines.add(parser.current);
    }

    final text = Strings.join(paragraphLines, '\n');
    final inlineNodes = parser.document.parseInline(text);
    return new Element('p', inlineNodes);
  }
}
OLD | NEW |