OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/python |
| 2 import re, md5, sys, string |
| 3 |
| 4 """markdown.py: A Markdown-styled-text to HTML converter in Python. |
| 5 |
| 6 Usage: |
| 7 ./markdown.py textfile.markdown |
| 8 |
| 9 Calling: |
| 10 import markdown |
| 11 somehtml = markdown.markdown(sometext) |
| 12 |
| 13 For other versions of markdown, see: |
| 14 http://www.freewisdom.org/projects/python-markdown/ |
| 15 http://en.wikipedia.org/wiki/Markdown |
| 16 """ |
| 17 |
| 18 __version__ = '1.0.1-2' # port of 1.0.1 |
| 19 __license__ = "GNU GPL 2" |
| 20 __author__ = [ |
| 21 'John Gruber <http://daringfireball.net/>', |
| 22 'Tollef Fog Heen <tfheen@err.no>', |
| 23 'Aaron Swartz <me@aaronsw.com>' |
| 24 ] |
| 25 |
| 26 def htmlquote(text): |
| 27 """Encodes `text` for raw use in HTML.""" |
| 28 text = text.replace("&", "&amp;") # Must be done first! |
| 29 text = text.replace("<", "&lt;") |
| 30 text = text.replace(">", "&gt;") |
| 31 text = text.replace("'", "&#39;") |
| 32 text = text.replace('"', "&quot;") |
| 33 return text |
| 34 |
| 35 def semirandom(seed): |
| 36 x = 0 |
| 37 for c in md5.new(seed).digest(): x += ord(c) |
| 38 return x / (255*16.) |
| 39 |
| 40 class _Markdown: |
| 41 emptyelt = " />" |
| 42 tabwidth = 4 |
| 43 |
| 44 escapechars = '\\`*_{}[]()>#+-.!' |
| 45 escapetable = {} |
| 46 for char in escapechars: |
| 47 escapetable[char] = md5.new(char).hexdigest() |
| 48 |
| 49 r_multiline = re.compile("\n{2,}") |
| 50 r_stripspace = re.compile(r"^[ \t]+$", re.MULTILINE) |
| 51 def parse(self, text): |
| 52 self.urls = {} |
| 53 self.titles = {} |
| 54 self.html_blocks = {} |
| 55 self.list_level = 0 |
| 56 |
| 57 text = text.replace("\r\n", "\n") |
| 58 text = text.replace("\r", "\n") |
| 59 text += "\n\n" |
| 60 text = self._Detab(text) |
| 61 text = self.r_stripspace.sub("", text) |
| 62 text = self._HashHTMLBlocks(text) |
| 63 text = self._StripLinkDefinitions(text) |
| 64 text = self._RunBlockGamut(text) |
| 65 text = self._UnescapeSpecialChars(text) |
| 66 return text |
| 67 |
| 68 r_StripLinkDefinitions = re.compile(r""" |
| 69 ^[ ]{0,%d}\[(.+)\]: # id = $1 |
| 70 [ \t]*\n?[ \t]* |
| 71 <?(\S+?)>? # url = $2 |
| 72 [ \t]*\n?[ \t]* |
| 73 (?: |
| 74 (?<=\s) # lookbehind for whitespace |
| 75 [\"\(] # " is backlashed so it colorizes our code right |
| 76 (.+?) # title = $3 |
| 77 [\"\)] |
| 78 [ \t]* |
| 79 )? # title is optional |
| 80 (?:\n+|\Z) |
| 81 """ % (tabwidth-1), re.MULTILINE|re.VERBOSE) |
| 82 def _StripLinkDefinitions(self, text): |
| 83 def replacefunc(matchobj): |
| 84 (t1, t2, t3) = matchobj.groups() |
| 85 #@@ case sensitivity? |
| 86 self.urls[t1.lower()] = self._EncodeAmpsAndAngles(t2) |
| 87 if t3 is not None: |
| 88 self.titles[t1.lower()] = t3.replace('"', '&quot;') |
| 89 return "" |
| 90 |
| 91 text = self.r_StripLinkDefinitions.sub(replacefunc, text) |
| 92 return text |
| 93 |
| 94 blocktagsb = r"p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|math" |
| 95 blocktagsa = blocktagsb + "|ins|del" |
| 96 |
| 97 r_HashHTMLBlocks1 = re.compile(r""" |
| 98 ( # save in $1 |
| 99 ^ # start of line (with /m) |
| 100 <(%s) # start tag = $2 |
| 101 \b # word break |
| 102 (.*\n)*? # any number of lines, minimally matching |
| 103 </\2> # the matching end tag |
| 104 [ \t]* # trailing spaces/tabs |
| 105 (?=\n+|$) # followed by a newline or end of document |
| 106 ) |
| 107 """ % blocktagsa, re.MULTILINE | re.VERBOSE) |
| 108 |
| 109 r_HashHTMLBlocks2 = re.compile(r""" |
| 110 ( # save in $1 |
| 111 ^ # start of line (with /m) |
| 112 <(%s) # start tag = $2 |
| 113 \b # word break |
| 114 (.*\n)*? # any number of lines, minimally matching |
| 115 .*</\2> # the matching end tag |
| 116 [ \t]* # trailing spaces/tabs |
| 117 (?=\n+|\Z) # followed by a newline or end of document |
| 118 ) |
| 119 """ % blocktagsb, re.MULTILINE | re.VERBOSE) |
| 120 |
| 121 r_HashHR = re.compile(r""" |
| 122 (?: |
| 123 (?<=\n\n) # Starting after a blank line |
| 124 | # or |
| 125 \A\n? # the beginning of the doc |
| 126 ) |
| 127 ( # save in $1 |
| 128 [ ]{0,%d} |
| 129 <(hr) # start tag = $2 |
| 130 \b # word break |
| 131 ([^<>])*? # |
| 132 /?> # the matching end tag |
| 133 [ \t]* |
| 134 (?=\n{2,}|\Z)# followed by a blank line or end of document |
| 135 ) |
| 136 """ % (tabwidth-1), re.VERBOSE) |
| 137 r_HashComment = re.compile(r""" |
| 138 (?: |
| 139 (?<=\n\n) # Starting after a blank line |
| 140 | # or |
| 141 \A\n? # the beginning of the doc |
| 142 ) |
| 143 ( # save in $1 |
| 144 [ ]{0,%d} |
| 145 (?: |
| 146 <! |
| 147 (--.*?--\s*)+ |
| 148 > |
| 149 ) |
| 150 [ \t]* |
| 151 (?=\n{2,}|\Z)# followed by a blank line or end of document |
| 152 ) |
| 153 """ % (tabwidth-1), re.VERBOSE) |
| 154 |
| 155 def _HashHTMLBlocks(self, text): |
| 156 def handler(m): |
| 157 key = md5.new(m.group(1)).hexdigest() |
| 158 self.html_blocks[key] = m.group(1) |
| 159 return "\n\n%s\n\n" % key |
| 160 |
| 161 text = self.r_HashHTMLBlocks1.sub(handler, text) |
| 162 text = self.r_HashHTMLBlocks2.sub(handler, text) |
| 163 oldtext = text |
| 164 text = self.r_HashHR.sub(handler, text) |
| 165 text = self.r_HashComment.sub(handler, text) |
| 166 return text |
| 167 |
| 168 #@@@ wrong! |
| 169 r_hr1 = re.compile(r'^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$', re.M) |
| 170 r_hr2 = re.compile(r'^[ ]{0,2}([ ]?-[ ]?){3,}[ \t]*$', re.M) |
| 171 r_hr3 = re.compile(r'^[ ]{0,2}([ ]?_[ ]?){3,}[ \t]*$', re.M) |
| 172 |
| 173 def _RunBlockGamut(self, text): |
| 174 text = self._DoHeaders(text) |
| 175 for x in [self.r_hr1, self.r_hr2, self.r_hr3]: |
| 176 text = x.sub("\n<hr%s\n" % self.emptyelt, text); |
| 177 text = self._DoLists(text) |
| 178 text = self._DoCodeBlocks(text) |
| 179 text = self._DoBlockQuotes(text) |
| 180 |
| 181 # We did this in parse() |
| 182 # to escape the source |
| 183 # now it's stuff _we_ made |
| 184 # so we don't wrap it in <p>s. |
| 185 text = self._HashHTMLBlocks(text) |
| 186 text = self._FormParagraphs(text) |
| 187 return text |
| 188 |
| 189 r_NewLine = re.compile(" {2,}\n") |
| 190 def _RunSpanGamut(self, text): |
| 191 text = self._DoCodeSpans(text) |
| 192 text = self._EscapeSpecialChars(text) |
| 193 text = self._DoImages(text) |
| 194 text = self._DoAnchors(text) |
| 195 text = self._DoAutoLinks(text) |
| 196 text = self._EncodeAmpsAndAngles(text) |
| 197 text = self._DoItalicsAndBold(text) |
| 198 text = self.r_NewLine.sub(" <br%s\n" % self.emptyelt, text) |
| 199 return text |
| 200 |
| 201 def _EscapeSpecialChars(self, text): |
| 202 tokens = self._TokenizeHTML(text) |
| 203 text = "" |
| 204 for cur_token in tokens: |
| 205 if cur_token[0] == "tag": |
| 206 cur_token[1] = cur_token[1].replace('*', self.escapetable["*"]) |
| 207 cur_token[1] = cur_token[1].replace('_', self.escapetable["_"]) |
| 208 text += cur_token[1] |
| 209 else: |
| 210 text += self._EncodeBackslashEscapes(cur_token[1]) |
| 211 return text |
| 212 |
| 213 r_DoAnchors1 = re.compile( |
| 214 r""" ( # wrap whole match in $1 |
| 215 \[ |
| 216 (.*?) # link text = $2 |
| 217 # [for bracket nesting, see below] |
| 218 \] |
| 219 |
| 220 [ ]? # one optional space |
| 221 (?:\n[ ]*)? # one optional newline followed by spaces |
| 222 |
| 223 \[ |
| 224 (.*?) # id = $3 |
| 225 \] |
| 226 ) |
| 227 """, re.S|re.VERBOSE) |
| 228 r_DoAnchors2 = re.compile( |
| 229 r""" ( # wrap whole match in $1 |
| 230 \[ |
| 231 (.*?) # link text = $2 |
| 232 \] |
| 233 \( # literal paren |
| 234 [ \t]* |
| 235 <?(.+?)>? # href = $3 |
| 236 [ \t]* |
| 237 ( # $4 |
| 238 ([\'\"]) # quote char = $5 |
| 239 (.*?) # Title = $6 |
| 240 \5 # matching quote |
| 241 )? # title is optional |
| 242 \) |
| 243 ) |
| 244 """, re.S|re.VERBOSE) |
| 245 def _DoAnchors(self, text): |
| 246 # We here don't do the same as the perl version, as python's regex |
| 247 # engine gives us no way to match brackets. |
| 248 |
| 249 def handler1(m): |
| 250 whole_match = m.group(1) |
| 251 link_text = m.group(2) |
| 252 link_id = m.group(3).lower() |
| 253 if not link_id: link_id = link_text.lower() |
| 254 title = self.titles.get(link_id, None) |
| 255 |
| 256 |
| 257 if self.urls.has_key(link_id): |
| 258 url = self.urls[link_id] |
| 259 url = url.replace("*", self.escapetable["*"]) |
| 260 url = url.replace("_", self.escapetable["_"]) |
| 261 res = '<a href="%s"' % htmlquote(url) |
| 262 |
| 263 if title: |
| 264 title = title.replace("*", self.escapetable["*"]) |
| 265 title = title.replace("_", self.escapetable["_"]) |
| 266 res += ' title="%s"' % htmlquote(title) |
| 267 res += ">%s</a>" % htmlquote(link_text) |
| 268 else: |
| 269 res = whole_match |
| 270 return res |
| 271 |
| 272 def handler2(m): |
| 273 whole_match = m.group(1) |
| 274 link_text = m.group(2) |
| 275 url = m.group(3) |
| 276 title = m.group(6) |
| 277 |
| 278 url = url.replace("*", self.escapetable["*"]) |
| 279 url = url.replace("_", self.escapetable["_"]) |
| 280 res = '''<a href="%s"''' % htmlquote(url) |
| 281 |
| 282 if title: |
| 283 title = title.replace('"', '&quot;') |
| 284 title = title.replace("*", self.escapetable["*"]) |
| 285 title = title.replace("_", self.escapetable["_"]) |
| 286 res += ' title="%s"' % htmlquote(title) |
| 287 res += ">%s</a>" % htmlquote(link_text) |
| 288 return res |
| 289 |
| 290 text = self.r_DoAnchors1.sub(handler1, text) |
| 291 text = self.r_DoAnchors2.sub(handler2, text) |
| 292 return text |
| 293 |
| 294 r_DoImages1 = re.compile( |
| 295 r""" ( # wrap whole match in $1 |
| 296 !\[ |
| 297 (.*?) # alt text = $2 |
| 298 \] |
| 299 |
| 300 [ ]? # one optional space |
| 301 (?:\n[ ]*)? # one optional newline followed by spaces |
| 302 |
| 303 \[ |
| 304 (.*?) # id = $3 |
| 305 \] |
| 306 |
| 307 ) |
| 308 """, re.VERBOSE|re.S) |
| 309 |
| 310 r_DoImages2 = re.compile( |
| 311 r""" ( # wrap whole match in $1 |
| 312 !\[ |
| 313 (.*?) # alt text = $2 |
| 314 \] |
| 315 \( # literal paren |
| 316 [ \t]* |
| 317 <?(\S+?)>? # src url = $3 |
| 318 [ \t]* |
| 319 ( # $4 |
| 320 ([\'\"]) # quote char = $5 |
| 321 (.*?) # title = $6 |
| 322 \5 # matching quote |
| 323 [ \t]* |
| 324 )? # title is optional |
| 325 \) |
| 326 ) |
| 327 """, re.VERBOSE|re.S) |
| 328 |
| 329 def _DoImages(self, text): |
| 330 def handler1(m): |
| 331 whole_match = m.group(1) |
| 332 alt_text = m.group(2) |
| 333 link_id = m.group(3).lower() |
| 334 |
| 335 if not link_id: |
| 336 link_id = alt_text.lower() |
| 337 |
| 338 alt_text = alt_text.replace('"', '&quot;') |
| 339 if self.urls.has_key(link_id): |
| 340 url = self.urls[link_id] |
| 341 url = url.replace("*", self.escapetable["*"]) |
| 342 url = url.replace("_", self.escapetable["_"]) |
| 343 res = '''<img src="%s" alt="%s"''' % (htmlquote(url), htmlquote(alt_text)) |
| 344 if self.titles.has_key(link_id): |
| 345 title = self.titles[link_id] |
| 346 title = title.replace("*", self.escapetable["*"]) |
| 347 title = title.replace("_", self.escapetable["_"]) |
| 348 res += ' title="%s"' % htmlquote(title) |
| 349 res += self.emptyelt |
| 350 else: |
| 351 res = whole_match |
| 352 return res |
| 353 |
| 354 def handler2(m): |
| 355 whole_match = m.group(1) |
| 356 alt_text = m.group(2) |
| 357 url = m.group(3) |
| 358 title = m.group(6) or '' |
| 359 |
| 360 alt_text = alt_text.replace('"', '&quot;') |
| 361 title = title.replace('"', '&quot;') |
| 362 url = url.replace("*", self.escapetable["*"]) |
| 363 url = url.replace("_", self.escapetable["_"]) |
| 364 res = '<img src="%s" alt="%s"' % (htmlquote(url), htmlquote(alt_text)) |
| 365 if title: |
| 366 title = title.replace("*", self.escapetable["*"]) |
| 367 title = title.replace("_", self.escapetable["_"]) |
| 368 res += ' title="%s"' % htmlquote(title) |
| 369 res += self.emptyelt |
| 370 return res |
| 371 |
| 372 text = self.r_DoImages1.sub(handler1, text) |
| 373 text = self.r_DoImages2.sub(handler2, text) |
| 374 return text |
| 375 |
| 376 r_DoHeaders = re.compile(r"^(\#{1,6})[ \t]*(.+?)[ \t]*\#*\n+", re.VERBOSE|re.M) |
| 377 def _DoHeaders(self, text): |
| 378 def findheader(text, c, n): |
| 379 textl = text.split('\n') |
| 380 for i in xrange(len(textl)): |
| 381 if i >= len(textl): continue |
| 382 count = textl[i].strip().count(c) |
| 383 if count > 0 and count == len(textl[i].strip()) and textl[i+1].strip() == '' and textl[i-1].strip() != '': |
| 384 textl = textl[:i] + textl[i+1:] |
| 385 textl[i-1] = '<h'+n+'>'+self._RunSpanGamut(textl[i-1])+'</h'+n+'>' |
| 386 textl = textl[:i] + textl[i+1:] |
| 387 text = '\n'.join(textl) |
| 388 return text |
| 389 |
| 390 def handler(m): |
| 391 level = len(m.group(1)) |
| 392 header = self._RunSpanGamut(m.group(2)) |
| 393 return "<h%s>%s</h%s>\n\n" % (level, header, level) |
| 394 |
| 395 text = findheader(text, '=', '1') |
| 396 text = findheader(text, '-', '2') |
| 397 text = self.r_DoHeaders.sub(handler, text) |
| 398 return text |
| 399 |
| 400 rt_l = r""" |
| 401 ( |
| 402 ( |
| 403 [ ]{0,%d} |
| 404 ([*+-]|\d+[.]) |
| 405 [ \t]+ |
| 406 ) |
| 407 (?:.+?) |
| 408 ( |
| 409 \Z |
| 410 | |
| 411 \n{2,} |
| 412 (?=\S) |
| 413 (?![ \t]* ([*+-]|\d+[.])[ \t]+) |
| 414 ) |
| 415 ) |
| 416 """ % (tabwidth - 1) |
| 417 r_DoLists = re.compile('^'+rt_l, re.M | re.VERBOSE | re.S) |
| 418 r_DoListsTop = re.compile( |
| 419 r'(?:\A\n?|(?<=\n\n))'+rt_l, re.M | re.VERBOSE | re.S) |
| 420 |
| 421 def _DoLists(self, text): |
| 422 def handler(m): |
| 423 list_type = "ol" |
| 424 if m.group(3) in [ "*", "-", "+" ]: |
| 425 list_type = "ul" |
| 426 listn = m.group(1) |
| 427 listn = self.r_multiline.sub("\n\n\n", listn) |
| 428 res = self._ProcessListItems(listn) |
| 429 res = "<%s>\n%s</%s>\n" % (list_type, res, list_type) |
| 430 return res |
| 431 |
| 432 if self.list_level: |
| 433 text = self.r_DoLists.sub(handler, text) |
| 434 else: |
| 435 text = self.r_DoListsTop.sub(handler, text) |
| 436 return text |
| 437 |
| 438 r_multiend = re.compile(r"\n{2,}\Z") |
| 439 r_ProcessListItems = re.compile(r""" |
| 440 (\n)? # leading line = $1 |
| 441 (^[ \t]*) # leading whitespace = $2 |
| 442 ([*+-]|\d+[.]) [ \t]+ # list marker = $3 |
| 443 ((?:.+?) # list item text = $4 |
| 444 (\n{1,2})) |
| 445 (?= \n* (\Z | \2 ([*+-]|\d+[.]) [ \t]+)) |
| 446 """, re.VERBOSE | re.M | re.S) |
| 447 |
| 448 def _ProcessListItems(self, text): |
| 449 self.list_level += 1 |
| 450 text = self.r_multiend.sub("\n", text) |
| 451 |
| 452 def handler(m): |
| 453 item = m.group(4) |
| 454 leading_line = m.group(1) |
| 455 leading_space = m.group(2) |
| 456 |
| 457 if leading_line or self.r_multiline.search(item): |
| 458 item = self._RunBlockGamut(self._Outdent(item)) |
| 459 else: |
| 460 item = self._DoLists(self._Outdent(item)) |
| 461 if item[-1] == "\n": item = item[:-1] # chomp |
| 462 item = self._RunSpanGamut(item) |
| 463 return "<li>%s</li>\n" % item |
| 464 |
| 465 text = self.r_ProcessListItems.sub(handler, text) |
| 466 self.list_level -= 1 |
| 467 return text |
| 468 |
| 469 r_DoCodeBlocks = re.compile(r""" |
| 470 (?:\n\n|\A) |
| 471 ( # $1 = the code block |
| 472 (?: |
| 473 (?:[ ]{%d} | \t) # Lines must start with a tab or equiv |
| 474 .*\n+ |
| 475 )+ |
| 476 ) |
| 477 ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space/end of doc |
| 478 """ % (tabwidth, tabwidth), re.M | re.VERBOSE) |
| 479 def _DoCodeBlocks(self, text): |
| 480 def handler(m): |
| 481 codeblock = m.group(1) |
| 482 codeblock = self._EncodeCode(self._Outdent(codeblock)) |
| 483 codeblock = self._Detab(codeblock) |
| 484 codeblock = codeblock.lstrip("\n") |
| 485 codeblock = codeblock.rstrip() |
| 486 res = "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock |
| 487 return res |
| 488 |
| 489 text = self.r_DoCodeBlocks.sub(handler, text) |
| 490 return text |
| 491 r_DoCodeSpans = re.compile(r""" |
| 492 (`+) # $1 = Opening run of ` |
| 493 (.+?) # $2 = The code block |
| 494 (?<!`) |
| 495 \1 # Matching closer |
| 496 (?!`) |
| 497 """, re.I|re.VERBOSE) |
| 498 def _DoCodeSpans(self, text): |
| 499 def handler(m): |
| 500 c = m.group(2) |
| 501 c = c.strip() |
| 502 c = self._EncodeCode(c) |
| 503 return "<code>%s</code>" % c |
| 504 |
| 505 text = self.r_DoCodeSpans.sub(handler, text) |
| 506 return text |
| 507 |
| 508 def _EncodeCode(self, text): |
| 509 text = text.replace("&","&amp;") |
| 510 text = text.replace("<","&lt;") |
| 511 text = text.replace(">","&gt;") |
| 512 for c in "*_{}[]\\": |
| 513 text = text.replace(c, self.escapetable[c]) |
| 514 return text |
| 515 |
| 516 |
| 517 r_DoBold = re.compile(r"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1", re.VERBOSE | re.S) |
| 518 r_DoItalics = re.compile(r"(\*|_) (?=\S) (.+?) (?<=\S) \1", re.VERBOSE | re.S) |
| 519 def _DoItalicsAndBold(self, text): |
| 520 text = self.r_DoBold.sub(r"<strong>\2</strong>", text) |
| 521 text = self.r_DoItalics.sub(r"<em>\2</em>", text) |
| 522 return text |
| 523 |
| 524 r_start = re.compile(r"^", re.M) |
| 525 r_DoBlockQuotes1 = re.compile(r"^[ \t]*>[ \t]?", re.M) |
| 526 r_DoBlockQuotes2 = re.compile(r"^[ \t]+$", re.M) |
| 527 r_DoBlockQuotes3 = re.compile(r""" |
| 528 ( # Wrap whole match in $1 |
| 529 ( |
| 530 ^[ \t]*>[ \t]? # '>' at the start of a line |
| 531 .+\n # rest of the first line |
| 532 (.+\n)* # subsequent consecutive lines |
| 533 \n* # blanks |
| 534 )+ |
| 535 )""", re.M | re.VERBOSE) |
| 536 r_protectpre = re.compile(r'(\s*<pre>.+?</pre>)', re.S) |
| 537 r_propre = re.compile(r'^ ', re.M) |
| 538 |
| 539 def _DoBlockQuotes(self, text): |
| 540 def prehandler(m): |
| 541 return self.r_propre.sub('', m.group(1)) |
| 542 |
| 543 def handler(m): |
| 544 bq = m.group(1) |
| 545 bq = self.r_DoBlockQuotes1.sub("", bq) |
| 546 bq = self.r_DoBlockQuotes2.sub("", bq) |
| 547 bq = self._RunBlockGamut(bq) |
| 548 bq = self.r_start.sub(" ", bq) |
| 549 bq = self.r_protectpre.sub(prehandler, bq) |
| 550 return "<blockquote>\n%s\n</blockquote>\n\n" % bq |
| 551 |
| 552 text = self.r_DoBlockQuotes3.sub(handler, text) |
| 553 return text |
| 554 |
| 555 r_tabbed = re.compile(r"^([ \t]*)") |
| 556 def _FormParagraphs(self, text): |
| 557 text = text.strip("\n") |
| 558 grafs = self.r_multiline.split(text) |
| 559 |
| 560 for g in xrange(len(grafs)): |
| 561 t = grafs[g].strip() #@@? |
| 562 if not self.html_blocks.has_key(t): |
| 563 t = self._RunSpanGamut(t) |
| 564 t = self.r_tabbed.sub(r"<p>", t) |
| 565 t += "</p>" |
| 566 grafs[g] = t |
| 567 |
| 568 for g in xrange(len(grafs)): |
| 569 t = grafs[g].strip() |
| 570 if self.html_blocks.has_key(t): |
| 571 grafs[g] = self.html_blocks[t] |
| 572 |
| 573 return "\n\n".join(grafs) |
| 574 |
| 575 r_EncodeAmps = re.compile(r"&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)") |
| 576 r_EncodeAngles = re.compile(r"<(?![a-z/?\$!])") |
| 577 def _EncodeAmpsAndAngles(self, text): |
| 578 text = self.r_EncodeAmps.sub("&amp;", text) |
| 579 text = self.r_EncodeAngles.sub("&lt;", text) |
| 580 return text |
| 581 |
| 582 def _EncodeBackslashEscapes(self, text): |
| 583 for char in self.escapechars: |
| 584 text = text.replace("\\" + char, self.escapetable[char]) |
| 585 return text |
| 586 |
| 587 r_link = re.compile(r"<((https?|ftp):[^\'\">\s]+)>", re.I) |
| 588 r_email = re.compile(r""" |
| 589 < |
| 590 (?:mailto:)? |
| 591 ( |
| 592 [-.\w]+ |
| 593 \@ |
| 594 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+ |
| 595 ) |
| 596 >""", re.VERBOSE|re.I) |
| 597 def _DoAutoLinks(self, text): |
| 598 text = self.r_link.sub(r'<a href="\1">\1</a>', text) |
| 599 |
| 600 def handler(m): |
| 601 l = m.group(1) |
| 602 return self._EncodeEmailAddress(self._UnescapeSpecialChars(l)) |
| 603 |
| 604 text = self.r_email.sub(handler, text) |
| 605 return text |
| 606 |
| 607 r_EncodeEmailAddress = re.compile(r">.+?:") |
| 608 def _EncodeEmailAddress(self, text): |
| 609 encode = [ |
| 610 lambda x: "&#%s;" % ord(x), |
| 611 lambda x: "&#x%X;" % ord(x), |
| 612 lambda x: x |
| 613 ] |
| 614 |
| 615 text = "mailto:" + text |
| 616 addr = "" |
| 617 for c in text: |
| 618 if c == ':': addr += c; continue |
| 619 |
| 620 r = semirandom(addr) |
| 621 if r < 0.45: |
| 622 addr += encode[1](c) |
| 623 elif r > 0.9 and c != '@': |
| 624 addr += encode[2](c) |
| 625 else: |
| 626 addr += encode[0](c) |
| 627 |
| 628 text = '<a href="%s">%s</a>' % (addr, addr) |
| 629 text = self.r_EncodeEmailAddress.sub('>', text) |
| 630 return text |
| 631 |
| 632 def _UnescapeSpecialChars(self, text): |
| 633 for key in self.escapetable.keys(): |
| 634 text = text.replace(self.escapetable[key], key) |
| 635 return text |
| 636 |
| 637 tokenize_depth = 6 |
| 638 tokenize_nested_tags = '|'.join([r'(?:<[a-z/!$](?:[^<>]'] * tokenize_depth) + (')*>)' * tokenize_depth) |
| 639 r_TokenizeHTML = re.compile( |
| 640 r"""(?: <! ( -- .*? -- \s* )+ > ) | # comment |
| 641 (?: <\? .*? \?> ) | # processing instruction |
| 642 %s # nested tags |
| 643 """ % tokenize_nested_tags, re.I|re.VERBOSE) |
| 644 def _TokenizeHTML(self, text): |
| 645 pos = 0 |
| 646 tokens = [] |
| 647 matchobj = self.r_TokenizeHTML.search(text, pos) |
| 648 while matchobj: |
| 649 whole_tag = matchobj.string[matchobj.start():matchobj.end()] |
| 650 sec_start = matchobj.end() |
| 651 tag_start = sec_start - len(whole_tag) |
| 652 if pos < tag_start: |
| 653 tokens.append(["text", matchobj.string[pos:tag_start]]) |
| 654 |
| 655 tokens.append(["tag", whole_tag]) |
| 656 pos = sec_start |
| 657 matchobj = self.r_TokenizeHTML.search(text, pos) |
| 658 |
| 659 if pos < len(text): |
| 660 tokens.append(["text", text[pos:]]) |
| 661 return tokens |
| 662 |
| 663 r_Outdent = re.compile(r"""^(\t|[ ]{1,%d})""" % tabwidth, re.M) |
| 664 def _Outdent(self, text): |
| 665 text = self.r_Outdent.sub("", text) |
| 666 return text |
| 667 |
| 668 def _Detab(self, text): return text.expandtabs(self.tabwidth) |
| 669 |
| 670 def Markdown(*args, **kw): return _Markdown().parse(*args, **kw) |
| 671 markdown = Markdown |
| 672 |
| 673 if __name__ == '__main__': |
| 674 if len(sys.argv) > 1: |
| 675 print Markdown(open(sys.argv[1]).read()) |
| 676 else: |
| 677 print Markdown(sys.stdin.read()) |
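
As the module docstring notes, the file can be run from the command line or imported. A minimal usage sketch of the import path, assuming the file above is saved as markdown.py on the Python 2 module path (the sample input string is illustrative):

    import markdown

    # Convert a small Markdown string to HTML.
    source = "Some *emphasized* text and a [link](http://example.com/).\n"
    html = markdown.markdown(source)   # markdown.Markdown(source) is equivalent
    print html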