OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. |
| 4 |
/// Matches a line that is empty or consists solely of spaces and tabs.
final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

/// Matches the underline of a setext-style header: a line made up entirely of
/// "=" characters or entirely of "-" characters.
final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

/// Matches an atx-style header: one to six leading "#", then the header text,
/// then any number of trailing "#". Group 1 is the run of leading "#" and
/// group 2 is the text between the markers.
final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

/// Matches a blockquote line: up to three leading spaces, a ">", and one
/// optional space after it. Group 1 is the quoted text.
final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

/// Matches a line indented by four spaces or by one tab. Used for code blocks
/// and for continuation lines of list items. Group 1 is the text after the
/// indent.
///
/// The four spaces are spelled "[ ]{4}" rather than as literal spaces so that
/// the required count is explicit and cannot be lost when whitespace is
/// collapsed by an editor or tool.
final _RE_INDENT = const RegExp(@'^(?:[ ]{4}|\t)(.*)$');

/// Three or more hyphens, asterisks or underscores by themselves, optionally
/// separated by up to two spaces. Note that a line like "----" is valid as
/// both HR and SETEXT. In case of a tie, SETEXT should win.
final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
                            @'(_+[ ]{0,2}){3,}|' +
                            @'(\*+[ ]{0,2}){3,})$');

/// Really hacky way to detect block-level embedded HTML. Just looks for
/// "<somename" followed by a space or ">".
final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

/// A line starting with one of these markers: "-", "*", "+". May have up to
/// three leading spaces before the marker and requires at least one space or
/// tab after it. Group 1 is the item text.
final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

/// A line starting with a number like "123.". May have up to three leading
/// spaces before the number and requires at least one space or tab after the
/// period. Group 1 is the item text.
final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
| 39 |
/// Holds the cursor state used while a series of lines is grouped into blocks
/// of markdown suitable for further inline parsing.
class BlockParser {
  /// The lines being walked.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Zero-based index of the line currently being examined.
  int pos;

  BlockParser(this.lines, this.document)
    : pos = 0;

  /// The line at [pos].
  String get current() => lines[pos];

  /// The line after the current one, or `null` when there is no such line.
  String get next() {
    final following = pos + 1;
    return (following < lines.length) ? lines[following] : null;
  }

  /// Moves the cursor to the following line.
  void advance() {
    pos++;
  }

  /// Whether the cursor has moved past the last line.
  bool get isDone() => pos >= lines.length;

  /// Gets whether or not the current line matches the given pattern. Always
  /// `false` once the parser is done.
  bool matches(RegExp regex) {
    return !isDone && regex.firstMatch(current) != null;
  }

  /// Gets whether or not the line after the current one matches the given
  /// pattern. Always `false` when there is no next line.
  bool matchesNext(RegExp regex) {
    final following = next;
    return following != null && regex.firstMatch(following) != null;
  }
}
| 79 |
/// Base class for the block-level syntaxes. Each subclass recognizes one kind
/// of block at the parser's current line and turns it into a [Node].
class BlockSyntax {
  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes() {
    // Lazy initialize.
    if (_syntaxes == null) {
      _syntaxes = [
          new EmptyBlockSyntax(),
          new BlockHtmlSyntax(),
          new SetextHeaderSyntax(),
          new HeaderSyntax(),
          new CodeBlockSyntax(),
          new BlockquoteSyntax(),
          new HorizontalRuleSyntax(),
          new UnorderedListSyntax(),
          new OrderedListSyntax(),
          new ParagraphSyntax()
        ];
    }

    return _syntaxes;
  }

  /// Cache backing [syntaxes]; stays `null` until first requested.
  static List<BlockSyntax> _syntaxes;

  /// Gets the regex used to identify the beginning of this block, if any.
  /// The base implementation has none and returns `null`.
  RegExp get pattern() => null;

  /// Whether a line this syntax can parse terminates the block before it.
  /// See [isAtBlockEnd].
  bool get canEndBlock() => true;

  /// Gets whether this syntax can parse a block starting at [parser]'s current
  /// line. The default tests the line against [pattern], so a subclass that
  /// leaves [pattern] as `null` must override this method.
  bool canParse(BlockParser parser) {
    return pattern.firstMatch(parser.current) != null;
  }

  /// Parses the block at [parser]'s current position.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines that match [pattern] and returns, for each,
  /// the text captured by the pattern's first group (i.e. the line with this
  /// block's marker or indent stripped off). Stops at the first line that
  /// does not match or when the parser runs out of lines.
  List<String> parseChildLines(BlockParser parser) {
    final childLines = <String>[];

    while (!parser.isDone) {
      final match = pattern.firstMatch(parser.current);
      if (match == null) break;
      childLines.add(match.group(1));
      parser.advance();
    }

    return childLines;
  }

  /// Gets whether or not [parser]'s current line should end the previous block.
  /// That is the case at the end of input, or when some built-in syntax both
  /// can parse the current line and has [canEndBlock] set.
  static bool isAtBlockEnd(BlockParser parser) {
    if (parser.isDone) return true;
    return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock);
  }
}
| 136 |
/// Recognizes blank lines and discards them.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_EMPTY;

  /// Skips over the blank line. Returns `null` because a blank line produces
  /// no node of its own.
  Node parse(BlockParser parser) {
    parser.advance();
    return null;
  }
}
| 147 |
/// Parses setext-style headers: a line of text underlined by a line of "=" or
/// a line of "-".
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks one line ahead: the current line is a header only if the *next*
  /// line is the underlining.
  bool canParse(BlockParser parser) {
    return parser.matchesNext(_RE_SETEXT);
  }

  /// Consumes the text line and its underline. An "=" underline yields an
  /// `h1`, a "-" underline an `h2`.
  Node parse(BlockParser parser) {
    final underline = _RE_SETEXT.firstMatch(parser.next).group(1);
    final tag = underline.startsWith('=') ? 'h1' : 'h2';

    final contents = parser.document.parseInline(parser.current);

    // Skip both the header text and the underline.
    parser.advance();
    parser.advance();

    return new Element(tag, contents);
  }
}
| 167 |
/// Parses atx-style headers: "## Header ##".
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HEADER;

  /// Consumes one header line. The number of leading "#" selects the tag
  /// ("h1" through "h6"); the trimmed text between the markers is parsed as
  /// inline content.
  Node parse(BlockParser parser) {
    final header = pattern.firstMatch(parser.current);
    parser.advance();

    final hashes = header.group(1);
    final text = header.group(2).trim();
    return new Element('h${hashes.length}', parser.document.parseInline(text));
  }
}
| 180 |
/// Parses email-style blockquotes: "> quote".
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_BLOCKQUOTE;

  /// Collects the run of quoted lines with their ">" prefix stripped, then
  /// hands them back to the document to be parsed recursively as blocks.
  Node parse(BlockParser parser) {
    final quoted = parseChildLines(parser);
    return new Element('blockquote', parser.document.parseLines(quoted));
  }
}
| 194 |
/// Parses preformatted code blocks, i.e. runs of lines matching the indent
/// pattern.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_INDENT;

  /// Collects the indented lines with the indent stripped, HTML-escapes them
  /// and wraps the result in `<pre><code>`.
  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // An extra empty entry makes the joined text end in a newline, which is
    // what the Markdown tests expect.
    codeLines.add('');

    final code = new Element.text(
        'code', escapeHtml(Strings.join(codeLines, '\n')));
    return new Element('pre', [code]);
  }
}
| 211 |
/// Parses horizontal rules like "---", "_ _ _", "*  *  *", etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HR;

  /// Consumes the rule line and emits an empty `hr` element. Nothing from the
  /// line itself is needed, so it is not matched again here.
  Node parse(BlockParser parser) {
    parser.advance();
    return new Element.empty('hr');
  }
}
| 222 |
/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1.  This one is way way WAY simpler.
/// 2.  All HTML tags at the block level will be treated as blocks. If you
///     start a paragraph with <em>, it will not wrap it in a <p> for you. As
///     soon as it sees something like HTML, it stops mucking with it until it
///     hits the next block.
/// 3.  Absolutely no HTML parsing or validation is done. We're a markdown
///     parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HTML;

  /// A line that looks like HTML does not interrupt the preceding block.
  bool get canEndBlock() => false;

  /// Passes lines through verbatim, as a single text node, up to (but not
  /// including) the next blank line or the end of input.
  Node parse(BlockParser parser) {
    final htmlLines = [];

    while (!parser.isDone) {
      if (parser.matches(_RE_EMPTY)) break;
      htmlLines.add(parser.current);
      parser.advance();
    }

    return new Text(Strings.join(htmlLines, '\n'));
  }
}
| 250 |
/// The raw lines of one list item, gathered before the item is converted to
/// a node.
class ListItem {
  /// The item's lines.
  final List<String> lines;

  /// Set when the item must be parsed as a block regardless of how many lines
  /// it has. Starts out `false`.
  bool forceBlock = false;

  ListItem(this.lines);
}
| 257 |
/// Base class for both ordered and unordered lists.
class ListSyntax extends BlockSyntax {
  /// The start of a list does not interrupt the preceding block.
  bool get canEndBlock() => false;

  /// The tag of the element wrapping the list items, e.g. 'ul' or 'ol'.
  abstract String get listTag();

  /// Consumes all of the lines that make up the list and returns a [listTag]
  /// element containing one `li` per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being accumulated, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Tests the current line against [pattern], remembering the match so the
    // branch that succeeded can read its captured group.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_RE_EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match.group(1));
      } else if (tryMatch(_RE_INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match.group(1));
      } else if (BlockSyntax.isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    // * one
    // * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    // <ul>
    //   <li>one</li>
    //   <li>two</li>
    // </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    // * one
    //
    // * two
    //
    // <ul>
    //   <li><p>one</p></li>
    //   <li><p>two</p></li>
    // </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Stops at index 1 so the item's first line is never removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Patterns that, when matched by a single-line item, make it a block.
    // The list is the same for every item, so it is built once up front.
    final blocksInList = const [
      _RE_BLOCKQUOTE,
      _RE_HEADER,
      _RE_HR,
      _RE_INDENT,
      _RE_UL,
      _RE_OL
    ];

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final pattern in blocksInList) {
          if (pattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
| 401 |
/// Parses unordered lists, i.e. lists whose items start with "-", "*" or "+".
class UnorderedListSyntax extends ListSyntax {
  /// Unordered lists are emitted as `ul` elements.
  String get listTag() => 'ul';

  RegExp get pattern() => _RE_UL;
}
| 407 |
/// Parses ordered lists, i.e. lists whose items start with a number like "1.".
class OrderedListSyntax extends ListSyntax {
  /// Ordered lists are emitted as `ol` elements.
  String get listTag() => 'ol';

  RegExp get pattern() => _RE_OL;
}
| 413 |
/// Parses paragraphs of regular text. This syntax accepts any line, so it
/// acts as the fallback for lines nothing else claims.
class ParagraphSyntax extends BlockSyntax {
  /// Plain text never interrupts the preceding block.
  bool get canEndBlock() => false;

  /// Every line can be parsed as paragraph text.
  bool canParse(BlockParser parser) => true;

  /// Gathers lines until one ends the paragraph, then parses them, joined by
  /// newlines, as the inline content of a `p` element.
  Node parse(BlockParser parser) {
    final paragraphLines = [];

    // Eat until we hit something that ends a paragraph.
    while (!BlockSyntax.isAtBlockEnd(parser)) {
      paragraphLines.add(parser.current);
      parser.advance();
    }

    final text = Strings.join(paragraphLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}
OLD | NEW |