| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
/// The line contains only whitespace or is empty.
const _RE_EMPTY = const RegExp(@'^([ \t]*)$');

/// A series of `=` or `-` (on the next line) define setext-style headers.
/// Group 1 holds the whole run of underline characters.
const _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

/// Leading (and trailing) `#` define atx-style headers. Group 1 holds the
/// leading `#` run (one to six characters), group 2 the header text.
const _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

/// The line starts with `>` with one optional space after. Up to three
/// leading spaces are allowed; group 1 holds the quoted text.
const _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

/// A line indented four spaces or one tab. Used for code blocks and lists.
/// Group 1 holds the line with that indent stripped.
///
/// The alternative must be exactly four literal spaces: accepting fewer would
/// make ordinary text that merely starts with a space look like a code block.
const _RE_INDENT = const RegExp(@'^(?:    |\t)(.*)$');

/// Three or more hyphens, asterisks or underscores by themselves. Note that
/// a line like `----` is valid as both HR and SETEXT. In case of a tie,
/// SETEXT should win.
const _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|'
                            @'(_+[ ]{0,2}){3,}|'
                            @'(\*+[ ]{0,2}){3,})$');

/// Really hacky way to detect block-level embedded HTML. Just looks for
/// "<somename".
const _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

/// A line starting with one of these markers: `-`, `*`, `+`. May have up to
/// three leading spaces before the marker and any number of spaces or tabs
/// after. Group 1 holds the text after the marker.
const _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

/// A line starting with a number like `123.`. May have up to three leading
/// spaces before the marker and any number of spaces or tabs after. Group 1
/// holds the text after the marker.
const _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
| 39 | |
/// Cursor over the lines of a markdown document. Block syntaxes read lines
/// through it and advance it as they consume them; the resulting blocks are
/// later handed to inline parsing.
class BlockParser {
  /// The lines being parsed.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Index into [lines] of the current line.
  int pos;

  BlockParser(this.lines, this.document)
    : pos = 0;

  /// The line at [pos].
  String get current => lines[pos];

  /// The line right after the current one, or `null` when no such line exists.
  String get next => (pos + 1 < lines.length) ? lines[pos + 1] : null;

  /// Moves on to the following line.
  void advance() {
    pos += 1;
  }

  /// Whether [pos] has moved past the last line.
  bool get isDone => lines.length <= pos;

  /// Whether the current line matches [regex]. Always `false` once the parser
  /// is done.
  bool matches(RegExp regex) => !isDone && regex.firstMatch(current) != null;

  /// Whether the line *after* the current one matches [regex]. `false` when
  /// there is no following line.
  bool matchesNext(RegExp regex) {
    final following = next;
    return following != null && regex.firstMatch(following) != null;
  }
}
| 82 | |
/// Base class of the block-level syntaxes. A syntax recognises the start of a
/// block through [canParse] and converts the block into a [Node] in [parse].
class BlockSyntax {
  /// Cache behind [syntaxes]; stays `null` until the first request.
  static List<BlockSyntax> _syntaxes;

  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes {
    // Built by an earlier call: hand back the same list.
    if (_syntaxes != null) return _syntaxes;

    _syntaxes = [
        new EmptyBlockSyntax(),
        new BlockHtmlSyntax(),
        new SetextHeaderSyntax(),
        new HeaderSyntax(),
        new CodeBlockSyntax(),
        new BlockquoteSyntax(),
        new HorizontalRuleSyntax(),
        new UnorderedListSyntax(),
        new OrderedListSyntax(),
        new ParagraphSyntax()
      ];
    return _syntaxes;
  }

  /// Gets the regex used to identify the beginning of this block, if any.
  /// The base implementation returns `null`.
  RegExp get pattern => null;

  /// Whether a line this syntax can parse terminates the block before it
  /// (consulted by [isAtBlockEnd]).
  bool get canEndBlock => true;

  /// Whether the parser's current line starts a block of this syntax. The
  /// default tests [pattern] against that line.
  bool canParse(BlockParser parser) =>
      pattern.firstMatch(parser.current) != null;

  /// Consumes the block starting at the parser's current line.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines matching [pattern] and returns the first
  /// capture group of each, i.e. the line with this syntax's prefix removed.
  /// Stops at the first line that does not match or at the end of input.
  List<String> parseChildLines(BlockParser parser) {
    final stripped = <String>[];

    for (; !parser.isDone; parser.advance()) {
      final lineMatch = pattern.firstMatch(parser.current);
      if (lineMatch == null) break;
      stripped.add(lineMatch[1]);
    }

    return stripped;
  }

  /// Gets whether or not [parser]'s current line should end the previous block:
  /// either the input is exhausted, or some syntax that may end a block
  /// recognises the line.
  static bool isAtBlockEnd(BlockParser parser) {
    return parser.isDone ||
        syntaxes.some((syntax) => syntax.canParse(parser) && syntax.canEndBlock);
  }
}
| 139 | |
/// Recognises blank lines and consumes them without producing a node.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern {
    return _RE_EMPTY;
  }

  /// Skips the blank line. Returns `null` because nothing is emitted for it.
  Node parse(BlockParser parser) {
    parser.advance();
    return null;
  }
}
| 150 | |
/// Parses setext-style headers: a line of text underlined by a line of `=`
/// (producing `h1`) or `-` (producing `h2`).
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks at the line *after* the current one: the header is recognised by
  /// its underline, not by the text line itself.
  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);

  Node parse(BlockParser parser) {
    final underline = _RE_SETEXT.firstMatch(parser.next);

    // The first underline character picks the header level.
    var tag;
    if (underline[1][0] == '=') {
      tag = 'h1';
    } else {
      tag = 'h2';
    }

    final contents = parser.document.parseInline(parser.current);

    // Consume both the text line and its underline.
    parser.advance();
    parser.advance();

    return new Element(tag, contents);
  }
}
| 170 | |
/// Parses atx-style headers: `## Header ##`. The number of leading `#`
/// characters selects the tag (`h1` to `h6`).
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern {
    return _RE_HEADER;
  }

  Node parse(BlockParser parser) {
    final header = pattern.firstMatch(parser.current);
    parser.advance();

    // Group 1 is the run of '#', group 2 the text between the markers.
    final level = header[1].length;
    final text = header[2].trim();
    return new Element('h$level', parser.document.parseInline(text));
  }
}
| 183 | |
/// Parses email-style blockquotes: `> quote`.
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern {
    return _RE_BLOCKQUOTE;
  }

  /// Strips the `>` prefix from each quoted line, then parses what is left as
  /// a nested sequence of blocks wrapped in a `blockquote` element.
  Node parse(BlockParser parser) {
    final quoted = parseChildLines(parser);
    return new Element('blockquote', parser.document.parseLines(quoted));
  }
}
| 197 | |
/// Parses preformatted code blocks: runs of lines matching the indent pattern.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern {
    return _RE_INDENT;
  }

  /// Collects the indented lines (indent removed), HTML-escapes them and wraps
  /// the result in `<pre><code>`.
  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // An extra empty entry makes the joined text end with a newline, which
    // the Markdown tests expect.
    codeLines.add('');

    final source = Strings.join(codeLines, '\n');
    final code = new Element.text('code', escapeHtml(source));
    return new Element('pre', [code]);
  }
}
| 214 | |
/// Parses horizontal rules like `---`, `_ _ _`, `* * *`, etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern => _RE_HR;

  /// Consumes the rule line and emits an empty `hr` element.
  ///
  /// The line's content is irrelevant once [canParse] has accepted it, so the
  /// pattern is not matched a second time here.
  Node parse(BlockParser parser) {
    parser.advance();
    return new Element.empty('hr');
  }
}
| 225 | |
/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1.  This one is way way WAY simpler.
/// 2.  All HTML tags at the block level will be treated as blocks. If you
///     start a paragraph with `<em>`, it will not wrap it in a `<p>` for you.
///     As soon as it sees something like HTML, it stops mucking with it until
///     it hits the next block.
/// 3.  Absolutely no HTML parsing or validation is done. We're a markdown
///     parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern {
    return _RE_HTML;
  }

  /// A line that looks like HTML does not terminate the block before it.
  bool get canEndBlock {
    return false;
  }

  /// Passes lines through untouched, as a single [Text] node, up to (but not
  /// including) the next blank line or the end of input.
  Node parse(BlockParser parser) {
    final htmlLines = [];

    for (; !(parser.isDone || parser.matches(_RE_EMPTY)); parser.advance()) {
      htmlLines.add(parser.current);
    }

    return new Text(Strings.join(htmlLines, '\n'));
  }
}
| 253 | |
/// One item of a list while the list is being assembled.
class ListItem {
  /// The lines collected for this item.
  final List<String> lines;

  /// When set, the item is parsed as block content even if it consists of a
  /// single line.
  bool forceBlock = false;

  ListItem(this.lines);
}
| 260 | |
/// Base class for both ordered and unordered lists.
class ListSyntax extends BlockSyntax {
  /// A list marker line does not terminate the block before it.
  bool get canEndBlock => false;

  /// The tag of the element that wraps the list items.
  abstract String get listTag;

  /// Consumes a whole list starting at the parser's current line and returns a
  /// [listTag] element holding one `li` per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being collected, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Matches the current line against [pattern], keeping the result in
    // `match` so the branch that succeeded can read its capture group.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_RE_EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match[1]);
      } else if (tryMatch(_RE_INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match[1]);
      } else if (BlockSyntax.isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    //     * one
    //     * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    //
    //     <ul>
    //       <li>one</li>
    //       <li>two</li>
    //     </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    //     * one
    //
    //     * two
    //
    //     <ul>
    //       <li><p>one</p></li>
    //       <li><p>two</p></li>
    //     </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // The scan stops before index 0, so an item always keeps its first line.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Patterns that turn a single-line item into block content. The list is
    // the same for every item, so it is set up once, outside the loop.
    final blocksInList = const [
        _RE_BLOCKQUOTE,
        _RE_HEADER,
        _RE_HR,
        _RE_INDENT,
        _RE_UL,
        _RE_OL
      ];

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final blockPattern in blocksInList) {
          if (blockPattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
| 404 | |
/// Parses unordered lists, emitted as `ul` elements.
class UnorderedListSyntax extends ListSyntax {
  RegExp get pattern {
    return _RE_UL;
  }

  String get listTag {
    return 'ul';
  }
}
| 410 | |
/// Parses ordered lists, emitted as `ol` elements.
class OrderedListSyntax extends ListSyntax {
  RegExp get pattern {
    return _RE_OL;
  }

  String get listTag {
    return 'ol';
  }
}
| 416 | |
/// Parses paragraphs of regular text. Accepts any line, so it acts as the
/// fallback syntax.
class ParagraphSyntax extends BlockSyntax {
  /// Paragraph text does not terminate the block before it.
  bool get canEndBlock {
    return false;
  }

  bool canParse(BlockParser parser) {
    return true;
  }

  /// Gathers lines until one of them ends the block, then parses the joined
  /// text as inline content of a `p` element.
  Node parse(BlockParser parser) {
    final paragraphLines = [];

    for (; !BlockSyntax.isAtBlockEnd(parser); parser.advance()) {
      paragraphLines.add(parser.current);
    }

    final text = Strings.join(paragraphLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}
| OLD | NEW |