utils/markdown/block_parser.dart - Issue 8953042: Move markdown library.

Side by Side Diff: utils/markdown/block_parser.dart

Issue 8953042: Move markdown library. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Add markdown tests to dartdoc. Created 9 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

/// The line contains only whitespace or is empty.
final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

/// A series of `=` or `-` (on the next line) define setext-style headers.
///
/// Group 1 holds the whole underline; its first character tells `=` from `-`.
final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

/// Leading (and trailing) `#` define atx-style headers.
///
/// Group 1 is the run of one to six opening `#` characters and group 2 is the
/// header text with any closing run of `#` left out.
final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

/// The line starts with `>` with one optional space after.
///
/// Up to three leading spaces are allowed before the `>`. Group 1 is the
/// remainder of the line.
final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

/// A line indented four spaces (or one tab). Used for code blocks and lists.
///
/// Group 1 is the line with that indent stripped.
final _RE_INDENT = const RegExp(@'^(?:    |\t)(.*)$');

/// Three or more hyphens, asterisks or underscores by themselves. Note that
/// a line like `----` is valid as both HR and SETEXT. In case of a tie,
/// SETEXT should win.
final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
                            @'(_+[ ]{0,2}){3,}|' +
                            @'(\*+[ ]{0,2}){3,})$');

/// Really hacky way to detect block-level embedded HTML. Just looks for
/// "<somename".
final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

/// A line starting with one of these markers: `-`, `*`, `+`. May have up to
/// three leading spaces before the marker and any number of spaces or tabs
/// after.
///
/// Group 1 is the item text following the marker.
final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

/// A line starting with a number like `123.`. May have up to three leading
/// spaces before the marker and any number of spaces or tabs after.
///
/// Group 1 is the item text following the marker.
final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');

39

/// Holds the cursor state used while turning a list of lines into markdown
/// blocks that are suitable for further inline parsing.
class BlockParser {
  /// The lines being parsed.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Index into [lines] of the line currently being examined. Starts at 0.
  int pos;

  BlockParser(this.lines, this.document) : pos = 0;

  /// The line at [pos].
  ///
  /// Performs no bounds check of its own; callers test [isDone] first.
  String get current() => lines[pos];

  /// The line immediately after [current], or `null` when [current] is the
  /// last line (or the parser is already past the end).
  String get next() => (pos + 1 < lines.length) ? lines[pos + 1] : null;

  /// Moves on to the following line.
  void advance() {
    pos += 1;
  }

  /// Whether every line has been consumed.
  bool get isDone() => pos >= lines.length;

  /// Whether the current line matches [regex]. Always `false` once [isDone].
  bool matches(RegExp regex) {
    return !isDone && regex.firstMatch(current) != null;
  }

  /// Whether the line after the current one matches [regex]. Always `false`
  /// when there is no such line.
  bool matchesNext(RegExp regex) {
    final following = next;
    return following != null && regex.firstMatch(following) != null;
  }
}

82

/// Base class for the block-level syntaxes. Each subclass recognises one kind
/// of block at the parser's current line and turns it into a [Node].
class BlockSyntax {
  /// Cache behind [syntaxes]; `null` until the first access.
  static List<BlockSyntax> _syntaxes;

  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes() {
    // Already built on an earlier call.
    if (_syntaxes != null) return _syntaxes;

    _syntaxes = [
        new EmptyBlockSyntax(),
        new BlockHtmlSyntax(),
        new SetextHeaderSyntax(),
        new HeaderSyntax(),
        new CodeBlockSyntax(),
        new BlockquoteSyntax(),
        new HorizontalRuleSyntax(),
        new UnorderedListSyntax(),
        new OrderedListSyntax(),
        new ParagraphSyntax()
      ];
    return _syntaxes;
  }

  /// Gets the regex used to identify the beginning of this block, if any.
  ///
  /// The base implementation returns `null`; [canParse] and [parseChildLines]
  /// call through this getter, so subclasses that rely on them override it.
  RegExp get pattern() => null;

  /// Whether a line starting this kind of block ends the block before it.
  bool get canEndBlock() => true;

  /// Whether this syntax applies at [parser]'s current line. By default that
  /// means the line matches [pattern].
  bool canParse(BlockParser parser) =>
      pattern.firstMatch(parser.current) != null;

  /// Consumes the block at [parser]'s current position and returns its node.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines for as long as they match [pattern] and
  /// returns the first capture group of each, in order.
  ///
  /// Stops (without consuming) at the first line that does not match, or at
  /// the end of input.
  List<String> parseChildLines(BlockParser parser) {
    final captured = <String>[];

    while (!parser.isDone) {
      final hit = pattern.firstMatch(parser.current);
      if (hit == null) return captured;

      // Keep only the captured part, i.e. the line minus its block marker.
      captured.add(hit[1]);
      parser.advance();
    }

    return captured;
  }

  /// Gets whether or not [parser]'s current line should end the previous block.
  ///
  /// True at end of input, or when some built-in syntax both applies to the
  /// current line and reports [canEndBlock].
  static bool isAtBlockEnd(BlockParser parser) {
    if (parser.isDone) return true;

    for (final syntax in syntaxes) {
      if (syntax.canParse(parser) && syntax.canEndBlock) return true;
    }
    return false;
  }
}

139

/// Recognises blank lines and skips over them without producing any output.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_EMPTY;

  /// Consumes the blank line and returns `null`, so nothing is emitted for it.
  Node parse(BlockParser parser) {
    parser.advance();
    return null;
  }
}

150

/// Parses setext-style headers: a line of text underlined by a row of `=`
/// or `-` on the following line.
class SetextHeaderSyntax extends BlockSyntax {
  /// Applies when the *next* line is the underline; the current line is the
  /// header text itself, so it is not tested here.
  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);

  /// Emits `h1` for an `=` underline and `h2` otherwise, then consumes both
  /// the text line and the underline.
  Node parse(BlockParser parser) {
    final underline = _RE_SETEXT.firstMatch(parser.next);

    var tag;
    if (underline[1][0] == '=') {
      tag = 'h1';
    } else {
      tag = 'h2';
    }

    final contents = parser.document.parseInline(parser.current);

    // Skip the header text and its underline.
    parser.advance();
    parser.advance();

    return new Element(tag, contents);
  }
}

170

/// Parses atx-style headers: `## Header ##`.
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HEADER;

  /// Consumes the header line and emits an `h<N>` element, where N is the
  /// length of the first capture group (the opening `#` run) and the content
  /// is the trimmed second capture group parsed as inline markdown.
  Node parse(BlockParser parser) {
    final header = pattern.firstMatch(parser.current);
    parser.advance();

    final level = header[1].length;
    final text = header[2].trim();
    return new Element('h$level', parser.document.parseInline(text));
  }
}

183

/// Parses email-style blockquotes: `> quote`.
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_BLOCKQUOTE;

  /// Gathers the run of quoted lines with their `>` prefix removed, parses
  /// that text recursively as its own series of blocks, and wraps the result
  /// in a `blockquote` element.
  Node parse(BlockParser parser) {
    final quoted = parseChildLines(parser);
    return new Element('blockquote', parser.document.parseLines(quoted));
  }
}

197

/// Parses preformatted code blocks that are indented four spaces.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_INDENT;

  /// Collects the indented lines (indent stripped), joins them with newlines,
  /// HTML-escapes the result and emits it as `<pre><code>…</code></pre>`.
  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // An extra empty entry makes the joined text end in a newline, which the
    // Markdown tests expect.
    codeLines.add('');

    final code =
        new Element.text('code', escapeHtml(Strings.join(codeLines, '\n')));
    return new Element('pre', [code]);
  }
}

214

/// Parses horizontal rules like `---`, `_ _ _`, `* * *`, etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HR;

  /// Consumes the rule line and emits an empty `hr` element.
  ///
  /// Nothing is captured from the line, so the pattern is not evaluated again
  /// here; whether the syntax applies is decided by `canParse`.
  Node parse(BlockParser parser) {
    parser.advance();
    return new Element.empty('hr');
  }
}

225

/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1.  This one is way way WAY simpler.
/// 2.  All HTML tags at the block level will be treated as blocks. If you
///     start a paragraph with `<em>`, it will not wrap it in a `<p>` for you.
///     As soon as it sees something like HTML, it stops mucking with it until
///     it hits the next block.
/// 3.  Absolutely no HTML parsing or validation is done. We're a markdown
///     parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HTML;

  /// A line that looks like HTML does not terminate the block before it.
  bool get canEndBlock() => false;

  /// Passes lines through untouched, from the current one up to (but not
  /// including) the next blank line or the end of input, as one [Text] node.
  Node parse(BlockParser parser) {
    final htmlLines = [];

    while (true) {
      // Stop at the end of input or at the first blank line.
      if (parser.isDone || parser.matches(_RE_EMPTY)) break;

      htmlLines.add(parser.current);
      parser.advance();
    }

    final raw = Strings.join(htmlLines, '\n');
    return new Text(raw);
  }
}

253

/// The raw lines of one list item, plus a flag recording whether the item has
/// to be parsed as a block rather than as inline text.
class ListItem {
  /// The item's lines, with the list marker or indent already stripped.
  final List<String> lines;

  /// Set when a blank line separates this item from a neighbouring one.
  /// Starts out `false`.
  bool forceBlock = false;

  ListItem(this.lines);
}

260

/// Base class for both ordered and unordered lists.
///
/// Subclasses provide the pattern that starts the list and the tag of the
/// element to emit ([listTag]); the item-gathering logic here is shared and
/// accepts both kinds of marker as the start of a new item.
class ListSyntax extends BlockSyntax {
  /// A list marker does not terminate the block before it.
  bool get canEndBlock() => false;

  /// The tag name of the element wrapping the items.
  abstract String get listTag();

  /// Consumes the whole list starting at [parser]'s current line and returns
  /// a [listTag] element with one `li` child per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Flushes the lines gathered so far into a new item, if there are any.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Matches the current line against [pattern], remembering the match so
    // the branch that succeeded can read its capture group.
    var match;
    tryMatch(RegExp pattern) {
      match = pattern.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_RE_EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match[1]);
      } else if (tryMatch(_RE_INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match[1]);
      } else if (isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    // * one
    // * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    // <ul>
    //   <li>one</li>
    //   <li>two</li>
    // </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    // * one
    //
    // * two
    //
    // <ul>
    //   <li><p>one</p></li>
    //   <li><p>two</p></li>
    // </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Stops at index 1: the first line of an item is never removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Patterns that make a single-line item a block when its line matches one
    // of them. Declared once, outside the per-item loop.
    final blocksInList = const [
        _RE_BLOCKQUOTE,
        _RE_HEADER,
        _RE_HR,
        _RE_INDENT,
        _RE_UL,
        _RE_OL
      ];

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final pattern in blocksInList) {
          if (pattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}

404

/// Parses unordered lists, emitting them as `ul` elements.
class UnorderedListSyntax extends ListSyntax {
  /// The tag of the element wrapping the items.
  String get listTag() => 'ul';

  /// The marker pattern that begins an unordered list.
  RegExp get pattern() => _RE_UL;
}

410

/// Parses ordered lists, emitting them as `ol` elements.
class OrderedListSyntax extends ListSyntax {
  /// The tag of the element wrapping the items.
  String get listTag() => 'ol';

  /// The marker pattern that begins an ordered list.
  RegExp get pattern() => _RE_OL;
}

416

/// Parses paragraphs of regular text.
class ParagraphSyntax extends BlockSyntax {
  /// Paragraph text never terminates the block before it.
  bool get canEndBlock() => false;

  /// Accepts any line, making this the fallback syntax.
  bool canParse(BlockParser parser) => true;

  /// Gathers lines until one ends the paragraph, joins them with newlines,
  /// and emits the inline-parsed result inside a `p` element.
  Node parse(BlockParser parser) {
    final paragraphLines = [];

    // Consume one line per iteration until a block boundary is reached.
    for (; !isAtBlockEnd(parser); parser.advance()) {
      paragraphLines.add(parser.current);
    }

    final text = Strings.join(paragraphLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}

OLD	NEW

« no previous file with comments | « utils/markdown/ast.dart ('k') | utils/markdown/html_renderer.dart » ('j') | no next file with comments »