Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 class _Re { | |
|
Jennifer Messerly
2011/11/23 22:25:41
what is this Java doing in my Dart code? :)
Serio
Bob Nystrom
2011/11/29 02:56:29
Reorganized. I don't know what I was thinking. The
| |
| 6 /// The line contains only whitespace or is empty. | |
| 7 static final EMPTY = const RegExp(@'^([ \t]*)$'); | |
| 8 | |
| 9 /// A series of "=" or "-" (on the next line) define setext-style headers. | |
| 10 static final SETEXT = const RegExp(@'^((=+)|(-+))$'); | |
| 11 | |
| 12 /// Leading (and trailing) "#" define atx-style headers. | |
| 13 static final HEADER = const RegExp(@'^(#{1,6})(.*?)#*$'); | |
| 14 | |
| 15 /// The line starts with ">" (after up to three leading spaces) with one optional space after. | |
| 16 static final BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$'); | |
| 17 | |
| 18 /// A line indented four spaces. Used for code blocks and lists. | |
| 19 static final INDENT = const RegExp(@'^(?: |\t)(.*)$'); | |
| 20 | |
| 21 /// Three or more hyphens, asterisks or underscores by themselves. Note that | |
| 22 /// a line like "----" is valid as both HR and SETEXT. In case of a tie, | |
| 23 /// SETEXT should win. | |
| 24 static final HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' + | |
| 25 @'(_+[ ]{0,2}){3,}|' + | |
| 26 @'(\*+[ ]{0,2}){3,})$'); | |
| 27 | |
| 28 /// Really hacky way to detect block-level embedded HTML. Just looks for | |
| 29 /// "<somename". | |
| 30 static final HTML = const RegExp(@'^<[ ]*\w+[ >]'); | |
| 31 | |
| 32 /// A line starting with one of these markers: "-", "*", "+". May have up to | |
| 33 /// three leading spaces before the marker and one or more spaces or tabs | |
| 34 /// after. | |
| 35 static final UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$'); | |
| 36 | |
| 37 /// A line starting with a number like "123.". May have up to three leading | |
| 38 /// spaces before the marker and one or more spaces or tabs after. | |
| 39 static final OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$'); | |
| 40 | |
| 41 /// These patterns when appearing in a single-line list item will force the | |
| 42 /// item to be parsed as a block. | |
| 43 static final BLOCKS_IN_LIST = const [BLOCKQUOTE, HEADER, HR, INDENT, UL, OL]; | |
| 44 } | |
| 45 | |
| 46 /// Maintains the internal state needed to parse a series of lines into blocks | |
| 47 /// of markdown suitable for further inline parsing. | |
| 48 class BlockParser { | |
| 49 final List<String> lines; | |
| 50 | |
| 51 /// The markdown document this parser is parsing. | |
| 52 final Document document; | |
| 53 | |
| 54 /// Index of the current line. | |
| 55 int pos; | |
| 56 | |
| 57 BlockParser(this.lines, this.document) | |
| 58 : pos = 0; | |
| 59 | |
| 60 /// Gets the current line. | |
| 61 String get current() => lines[pos]; | |
| 62 | |
| 63 /// Gets the line after the current one or `null` if there is none. | |
| 64 String get next() { | |
| 65 // Don't read past the end. | |
| 66 if (pos >= lines.length - 1) return null; | |
| 67 return lines[pos + 1]; | |
| 68 } | |
| 69 | |
| 70 void advance() => pos++; | |
| 71 bool get isDone() => pos >= lines.length; | |
| 72 | |
| 73 /// Gets whether or not the current line matches the given pattern. Always false once past the last line. | |
| 74 bool matches(RegExp regex) { | |
| 75 if (isDone) return false; | |
| 76 return regex.firstMatch(current) != null; | |
| 77 } | |
| 78 | |
| 79 /// Gets whether or not the line after the current one matches the given pattern. Always false when there is no next line. | |
| 80 bool matchesNext(RegExp regex) { | |
| 81 if (next == null) return false; | |
| 82 return regex.firstMatch(next) != null; | |
| 83 } | |
| 84 } | |
| 85 | |
| 86 class BlockSyntax { | |
| 87 /// Gets the collection of built-in block parsers. To turn a series of lines | |
| 88 /// into blocks, each of these will be tried in turn. Order matters here. | |
| 89 static List<BlockSyntax> get syntaxes() { | |
| 90 // Lazy initialize. | |
| 91 if (_syntaxes == null) { | |
| 92 _syntaxes = [ | |
| 93 new EmptyBlockSyntax(), | |
|
Jennifer Messerly
2011/11/23 22:25:41
could these be const, and then use a "static final
Bob Nystrom
2011/11/29 02:56:29
They could be, but I'm thinking users may be able
| |
| 94 new BlockHtmlSyntax(), | |
| 95 new SetextHeaderSyntax(), | |
| 96 new HeaderSyntax(), | |
| 97 new CodeBlockSyntax(), | |
| 98 new BlockquoteSyntax(), | |
| 99 new HorizontalRuleSyntax(), | |
| 100 new UnorderedListSyntax(), | |
| 101 new OrderedListSyntax(), | |
| 102 new ParagraphSyntax() | |
| 103 ]; | |
| 104 } | |
| 105 | |
| 106 return _syntaxes; | |
| 107 } | |
| 108 | |
| 109 static List<BlockSyntax> _syntaxes; | |
| 110 | |
| 111 /// Gets the regex used to identify the beginning of this block, if any. Subclasses returning `null` must override [canParse]. | |
| 112 RegExp get pattern() => null; | |
| 113 | |
| 114 bool get canEndBlock() => true; | |
| 115 | |
| 116 bool canParse(BlockParser parser) { | |
| 117 return pattern.firstMatch(parser.current) != null; | |
| 118 } | |
| 119 | |
| 120 abstract Node parse(BlockParser parser); | |
| 121 | |
| 122 List<String> parseChildLines(BlockParser parser) { | |
| 123 // Collect each consecutive line matching [pattern], keeping only capture group 1 (the line minus its block marker, e.g. ">"). | |
| 124 final childLines = <String>[]; | |
| 125 | |
| 126 while (!parser.isDone) { | |
| 127 final match = pattern.firstMatch(parser.current); | |
| 128 if (match == null) break; | |
| 129 childLines.add(match.group(1)); | |
| 130 parser.advance(); | |
| 131 } | |
| 132 | |
| 133 return childLines; | |
| 134 } | |
| 135 | |
| 136 /// Gets whether or not [parser]'s current line should end the previous block. | |
| 137 static bool isAtBlockEnd(BlockParser parser) { | |
| 138 if (parser.isDone) return true; | |
| 139 return syntaxes.some((s) => s.canParse(parser) && s.canEndBlock); | |
| 140 } | |
| 141 } | |
| 142 | |
| 143 class EmptyBlockSyntax extends BlockSyntax { | |
| 144 RegExp get pattern() => _Re.EMPTY; | |
| 145 | |
| 146 Node parse(BlockParser parser) { | |
| 147 parser.advance(); | |
| 148 | |
| 149 // A blank line produces no node; it is only consumed. | |
| 150 return null; | |
| 151 } | |
| 152 } | |
| 153 | |
| 154 /// Parses setext-style headers: a line of text underlined by a line of "=" (h1) or "-" (h2). | |
| 155 class SetextHeaderSyntax extends BlockSyntax { | |
| 156 bool canParse(BlockParser parser) { | |
| 157 // Note: matches *next* line, not the current one. We're looking for the | |
| 158 // underlining after this line. | |
| 159 return parser.matchesNext(_Re.SETEXT); | |
| 160 } | |
| 161 | |
| 162 Node parse(BlockParser parser) { | |
| 163 final match = _Re.SETEXT.firstMatch(parser.next); | |
| 164 | |
| 165 final tag = (match.group(1)[0] == '=') ? 'h1' : 'h2'; | |
| 166 final contents = parser.document.parseInline(parser.current); | |
| 167 parser.advance(); | |
| 168 parser.advance(); | |
| 169 | |
| 170 return new Element(tag, contents); | |
| 171 } | |
| 172 } | |
| 173 | |
| 174 /// Parses atx-style headers: "## Header ##". The number of leading "#" sets the header level. | |
| 175 class HeaderSyntax extends BlockSyntax { | |
| 176 RegExp get pattern() => _Re.HEADER; | |
| 177 | |
| 178 Node parse(BlockParser parser) { | |
| 179 final match = pattern.firstMatch(parser.current); | |
| 180 parser.advance(); | |
| 181 final level = match.group(1).length; | |
| 182 final contents = parser.document.parseInline(match.group(2).trim()); | |
| 183 return new Element('h$level', contents); | |
| 184 } | |
| 185 } | |
| 186 | |
| 187 /// Parses email-style blockquotes: "> quote". The quoted lines are themselves parsed as blocks. | |
| 188 class BlockquoteSyntax extends BlockSyntax { | |
| 189 RegExp get pattern() => _Re.BLOCKQUOTE; | |
| 190 | |
| 191 Node parse(BlockParser parser) { | |
| 192 final childLines = parseChildLines(parser); | |
| 193 | |
| 194 // Recursively parse the contents of the blockquote. | |
| 195 final children = parser.document.parseLines(childLines); | |
| 196 | |
| 197 return new Element('blockquote', children); | |
| 198 } | |
| 199 } | |
| 200 | |
| 201 /// Parses preformatted code blocks that are indented four spaces, emitting HTML-escaped <pre><code>. | |
| 202 class CodeBlockSyntax extends BlockSyntax { | |
| 203 RegExp get pattern() => _Re.INDENT; | |
| 204 | |
| 205 Node parse(BlockParser parser) { | |
| 206 final childLines = parseChildLines(parser); | |
| 207 | |
| 208 // The Markdown tests expect a trailing newline. | |
| 209 childLines.add(''); | |
| 210 | |
| 211 // Escape the code. | |
| 212 final escaped = escapeHtml(Strings.join(childLines, '\n')); | |
| 213 | |
| 214 return new Element('pre', [new Element.text('code', escaped)]); | |
| 215 } | |
| 216 } | |
| 217 | |
| 218 /// Parses horizontal rules like "---", "_ _ _", "* * *", etc. | |
| 219 class HorizontalRuleSyntax extends BlockSyntax { | |
| 220 RegExp get pattern() => _Re.HR; | |
| 221 | |
| 222 Node parse(BlockParser parser) { | |
| 223 // A rule has no content to capture, so just consume its line. | |
| 224 parser.advance(); | |
| 225 return new Element.empty('hr'); | |
| 226 } | |
| 227 } | |
| 228 | |
| 229 /// Parses inline HTML at the block level. This differs from other markdown | |
| 230 /// implementations in several ways: | |
| 231 /// | |
| 232 /// 1. This one is way way WAY simpler. | |
| 233 /// 2. All HTML tags at the block level will be treated as blocks. If you start | |
| 234 /// a paragraph with <em>, it will not wrap it in a <p> for you. As soon as | |
| 235 /// it sees something like HTML, it stops mucking with it until it hits the | |
| 236 /// next block. | |
| 237 /// 3. Absolutely no HTML parsing or validation is done. We're a markdown | |
| 238 /// parser not an HTML parser! | |
| 239 class BlockHtmlSyntax extends BlockSyntax { | |
| 240 RegExp get pattern() => _Re.HTML; | |
| 241 | |
| 242 bool get canEndBlock() => false; | |
| 243 | |
| 244 Node parse(BlockParser parser) { | |
| 245 final childLines = []; | |
| 246 | |
| 247 // Eat lines verbatim until we hit a blank line or run out of input. | |
| 248 while (!parser.isDone && !parser.matches(_Re.EMPTY)) { | |
| 249 childLines.add(parser.current); | |
| 250 parser.advance(); | |
| 251 } | |
| 252 | |
| 253 return new Text(Strings.join(childLines, '\n')); | |
| 254 } | |
| 255 } | |
| 256 | |
| 257 class ListItem { | |
| 258 bool forceBlock = false; | |
| 259 final List<String> lines; | |
| 260 | |
| 261 ListItem(this.lines); | |
| 262 } | |
| 263 | |
| 264 /// Base class for both ordered and unordered lists. | |
| 265 class ListSyntax extends BlockSyntax { | |
| 266 bool get canEndBlock() => false; | |
| 267 | |
| 268 abstract String get listTag(); | |
| 269 | |
| 270 Node parse(BlockParser parser) { | |
| 271 final items = <ListItem>[]; | |
| 272 var childLines = <String>[]; | |
| 273 | |
| 274 endItem() { | |
| 275 if (childLines.length > 0) { | |
| 276 items.add(new ListItem(childLines)); | |
| 277 childLines = <String>[]; | |
| 278 } | |
| 279 } | |
| 280 | |
| 281 var match; | |
| 282 tryMatch(RegExp pattern) { | |
| 283 match = pattern.firstMatch(parser.current); | |
| 284 return match != null; | |
| 285 } | |
| 286 | |
| 287 // Gather the lines of each item until something ends the list. | |
| 288 while (!parser.isDone) { | |
| 289 if (tryMatch(_Re.EMPTY)) { | |
| 290 // Add a blank line to the current list item. | |
| 291 childLines.add(''); | |
| 292 } else if (tryMatch(_Re.UL) || tryMatch(_Re.OL)) { | |
| 293 // End the current list item and start a new one. | |
| 294 endItem(); | |
| 295 childLines.add(match.group(1)); | |
| 296 } else if (tryMatch(_Re.INDENT)) { | |
| 297 // Strip off indent and add to current item. | |
| 298 childLines.add(match.group(1)); | |
| 299 } else if (isAtBlockEnd(parser)) { | |
| 300 // Done with the list. | |
| 301 break; | |
| 302 } else { | |
| 303 // Anything else is paragraph text or other stuff that can be in a list | |
| 304 // item. However, if the previous item is a blank line, this means we're | |
| 305 // done with the list and are starting a new top-level paragraph. | |
| 306 if ((childLines.length > 0) && (childLines.last() == '')) break; | |
| 307 childLines.add(parser.current); | |
| 308 } | |
| 309 parser.advance(); | |
| 310 } | |
| 311 | |
| 312 endItem(); | |
| 313 | |
| 314 // Markdown, because it hates us, specifies two kinds of list items. If you | |
| 315 // have a list like: | |
| 316 // | |
| 317 // * one | |
| 318 // * two | |
| 319 // | |
| 320 // Then it will insert the contents of the lines directly in the <li>, like: | |
| 321 // <ul> | |
| 322 // <li>one</li> | |
| 323 // <li>two</li> | |
| 324 // </ul> | |
| 325 // | |
| 326 // If, however, there are blank lines between the items, each is wrapped in | |
| 327 // paragraphs: | |
| 328 // | |
| 329 // * one | |
| 330 // | |
| 331 // * two | |
| 332 // | |
| 333 // <ul> | |
| 334 // <li><p>one</p></li> | |
| 335 // <li><p>two</p></li> | |
| 336 // </ul> | |
| 337 // | |
| 338 // In other words, sometimes we parse the contents of a list item like a | |
| 339 // block, and sometimes like an inline. The rules our parser implements are: | |
| 340 // | |
| 341 // - If it has more than one line, it's a block. | |
| 342 // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT, | |
| 343 // UL, OL) it's a block. (This is for cases like "* > quote".) | |
| 344 // - If there was a blank line between this item and the previous one, it's | |
| 345 // a block. | |
| 346 // - If there was a blank line between this item and the next one, it's a | |
| 347 // block. | |
| 348 // - Otherwise, parse it as an inline. | |
| 349 | |
| 350 // Remove any trailing empty lines and note which items are separated by | |
| 351 // empty lines. Do this before seeing which items are single-line so that | |
| 352 // trailing empty lines on the last item don't force it into being a block. | |
| 353 for (int i = 0; i < items.length; i++) { | |
| 354 for (int j = items[i].lines.length - 1; j > 0; j--) { | |
| 355 if (_Re.EMPTY.firstMatch(items[i].lines[j]) != null) { | |
| 356 // Found an empty line. Item and one after it are blocks. | |
| 357 if (i < items.length - 1) { | |
| 358 items[i].forceBlock = true; | |
| 359 items[i + 1].forceBlock = true; | |
| 360 } | |
| 361 items[i].lines.removeLast(); | |
| 362 } else { | |
| 363 break; | |
| 364 } | |
| 365 } | |
| 366 } | |
| 367 | |
| 368 // Convert the list items to Nodes. | |
| 369 final itemNodes = <Node>[]; | |
| 370 for (final item in items) { | |
| 371 bool blockItem = item.forceBlock || (item.lines.length > 1); | |
| 372 | |
| 373 // See if it matches some block parser. | |
| 374 if (!blockItem) { | |
| 375 for (final pattern in _Re.BLOCKS_IN_LIST) { | |
| 376 if (pattern.firstMatch(item.lines[0]) != null) { | |
| 377 blockItem = true; | |
| 378 break; | |
| 379 } | |
| 380 } | |
| 381 } | |
| 382 | |
| 383 // Parse the item as a block or inline. | |
| 384 if (blockItem) { | |
| 385 // Block list item. | |
| 386 final children = parser.document.parseLines(item.lines); | |
| 387 itemNodes.add(new Element('li', children)); | |
| 388 } else { | |
| 389 // Raw list item. | |
| 390 final contents = parser.document.parseInline(item.lines[0]); | |
| 391 itemNodes.add(new Element('li', contents)); | |
| 392 } | |
| 393 } | |
| 394 | |
| 395 return new Element(listTag, itemNodes); | |
| 396 } | |
| 397 } | |
| 398 | |
| 399 /// Parses unordered lists (items marked with "-", "*" or "+") into a <ul> element. | |
| 400 class UnorderedListSyntax extends ListSyntax { | |
| 401 RegExp get pattern() => _Re.UL; | |
| 402 String get listTag() => 'ul'; | |
| 403 } | |
| 404 | |
| 405 /// Parses ordered lists (items marked with a number like "123.") into an <ol> element. | |
| 406 class OrderedListSyntax extends ListSyntax { | |
| 407 RegExp get pattern() => _Re.OL; | |
| 408 String get listTag() => 'ol'; | |
| 409 } | |
| 410 | |
| 411 /// Parses paragraphs of regular text. Accepts any line, so it acts as the fallback syntax. | |
| 412 class ParagraphSyntax extends BlockSyntax { | |
| 413 bool get canEndBlock() => false; | |
| 414 | |
| 415 bool canParse(BlockParser parser) => true; | |
| 416 | |
| 417 Node parse(BlockParser parser) { | |
| 418 final childLines = []; | |
| 419 | |
| 420 // Eat until we hit something that ends a paragraph. | |
| 421 while (!isAtBlockEnd(parser)) { | |
| 422 childLines.add(parser.current); | |
| 423 parser.advance(); | |
| 424 } | |
| 425 | |
| 426 final contents = parser.document.parseInline( | |
| 427 Strings.join(childLines, '\n')); | |
| 428 return new Element('p', contents); | |
| 429 } | |
| 430 } | |
| OLD | NEW |