OLD | NEW |
(Empty) | |
| 1 # markdown is released under the BSD license |
| 2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later) |
| 3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) |
| 4 # Copyright 2004 Manfred Stienstra (the original version) |
| 5 # |
| 6 # All rights reserved. |
| 7 # |
| 8 # Redistribution and use in source and binary forms, with or without |
| 9 # modification, are permitted provided that the following conditions are met: |
| 10 # |
| 11 # * Redistributions of source code must retain the above copyright |
| 12 # notice, this list of conditions and the following disclaimer. |
| 13 # * Redistributions in binary form must reproduce the above copyright |
| 14 # notice, this list of conditions and the following disclaimer in the |
| 15 # documentation and/or other materials provided with the distribution. |
| 16 # * Neither the name of the <organization> nor the |
| 17 # names of its contributors may be used to endorse or promote products |
| 18 # derived from this software without specific prior written permission. |
| 19 # |
| 20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY |
| 21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT |
| 24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 30 # POSSIBILITY OF SUCH DAMAGE. |
| 31 |
| 32 |
| 33 """ |
| 34 PRE-PROCESSORS |
| 35 ============================================================================= |
| 36 |
| 37 Preprocessors work on source text before we start doing anything too |
| 38 complicated. |
| 39 """ |
| 40 |
| 41 from __future__ import absolute_import |
| 42 from __future__ import unicode_literals |
| 43 from . import util |
| 44 from . import odict |
| 45 import re |
| 46 |
| 47 |
def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered collection of Markdown preprocessors.

    Every preprocessor is constructed with `md_instance`.  The entries are
    registered in this order: 'normalize_whitespace', 'html_block' and
    'reference'.  The 'html_block' entry is left out when
    `md_instance.safeMode` equals 'escape'.  Extra keyword arguments are
    accepted but not used here.

    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)

    escaping_html = md_instance.safeMode == 'escape'
    if not escaping_html:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)

    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
| 56 |
| 57 |
class Preprocessor(util.Processor):
    """
    Base class for processors that work on the document's source lines.

    A preprocessor exposes a single `run` method.  It is handed the document
    as a list of lines, changes it as needed, and gives back a list of
    lines -- either the same list object or a newly built one.

    Custom preprocessors must subclass markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Transform `lines` and return the (possibly modified) list of lines.

        `lines` is the document as a list of strings split on newlines.
        Subclasses should override this method; this base implementation
        does nothing and returns None.

        """
        return None
| 77 |
| 78 |
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Clean up the joined source text and return it re-split into lines.

        The lines are joined with newlines, then every occurrence of
        `util.STX` and `util.ETX` is dropped, line endings are unified to
        "\\n", two newlines are appended, tabs are expanded to
        `self.markdown.tab_length` columns and lines made up solely of
        spaces are emptied.

        """
        text = '\n'.join(lines)

        # Drop any STX/ETX characters present in the input.
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")

        # Windows line endings first, then any lone carriage returns.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"

        text = text.expandtabs(self.markdown.tab_length)

        # A run of spaces that fills a whole line (after a newline) is removed.
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')
| 89 |
| 90 |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates for the closing-tag text, tried in order by _get_right_tag();
    # each one is filled in with the left (opening) tag name.
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose-mode regex fragment matching a single attribute of an opening
    # tag: quoted value, unquoted value, or a bare attribute name.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
        | # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value
        | # OR
        \s+(?P<attr2>[^>"'/= ]+) # attr
        """
    # Matches an opening tag at the very start of a block, capturing the tag
    # name and the run of attributes that follows it.
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, run() treats blocks whose opening tag has a `markdown`
    # attribute specially: only the opening and closing tags are stashed and
    # the content between them stays in the output.
    # NOTE(review): presumably switched on by an extension -- confirm.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a 3-tuple `(tag, length, attrs)`: the tag name, the length of
        the opening-tag text and a dict of attribute names to values
        (attributes without a value map to "").

        """
        m = self.left_tag_re.match(block)
        if m:
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                for ma in self.attrs_re.finditer(raw_attrs):
                    # Exactly one of the three alternatives of attrs_pattern
                    # matched; find out which and record the attribute.
                    if ma.group('attr'):
                        if ma.group('value'):
                            attrs[ma.group('attr').strip()] = ma.group('value')
                        else:
                            attrs[ma.group('attr').strip()] = ""
                    elif ma.group('attr1'):
                        if ma.group('value1'):
                            attrs[ma.group('attr1').strip()] = ma.group('value1')
                        else:
                            attrs[ma.group('attr1').strip()] = ""
                    elif ma.group('attr2'):
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs
        else:
            # Fallback when the regex does not match: take everything between
            # the first character and the first ">" as the tag.  Unlike the
            # branch above, the tag is lowercased here.
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag)+2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the `rtag` that closes the current `ltag`, skipping nested pairs.

        Searching starts at `start_index` in `block`.  Returns the index just
        past the matching `rtag`, or -1 if no matching `rtag` is found.

        """
        while 1:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for `left_tag` in `block`.

        Each template in `right_tag_patterns` is tried in turn, searching
        from `left_index`.  Returns a 2-tuple `(right_tag, index)` where
        `right_tag` is the closing-tag text without its leading "<" and
        trailing ">" and `index` is the position just past it.  If nothing is
        found, the tail of the block and `len(block)` are returned instead.

        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            # A "not found" result (-1) fails this check.
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        # No closing tag found: report the last `left_index` characters of the
        # right-stripped block (minus the final one), lowercased.
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True if `right_tag` is accepted as closing `left_tag`."""
        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        # "--" is the tag name run() assigns to comment blocks.
        if (right_tag == "--" and left_tag == "--"):
            return True
        elif left_tag == right_tag[1:] \
            and right_tag[0] == "/":
            return True
        else:
            return False

    def _is_oneliner(self, tag):
        """Return True if `tag` is 'hr' or 'hr/'."""
        return (tag in ['hr', 'hr/'])

    def run(self, lines):
        """
        Replace raw HTML blocks in `lines` with stashed references.

        The lines are joined and split into blocks on blank lines ("\\n\\n").
        Blocks recognised as raw HTML are passed to
        `self.markdown.htmlStash.store()` and the value it returns takes
        their place; other blocks are kept unchanged.  An HTML block that is
        not closed within a single text block is collected across following
        blocks until its closing tag is found (or the input ends).

        Returns the resulting text as a list of lines.

        """
        text = "\n".join(lines)
        new_blocks = []
        # From here on `text` is a list of blocks used as a work queue.
        text = text.rsplit("\n\n")
        # Blocks collected so far for an HTML element that is still open.
        items = []
        left_tag = ''
        right_tag = ''
        in_tag = False # flag

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1] == "!":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    # Text following the closing tag is pushed back onto the
                    # queue so it gets processed as a block of its own.
                    if data_index < len(block) \
                        and (util.isBlockLevel(left_tag)
                        or left_tag == '--'):
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    # Neither a block-level tag nor a "<!", "<?", "<@" or
                    # "<%" construct: leave the block untouched.
                    if not (util.isBlockLevel(left_tag) \
                        or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                        and self._equal_tags(left_tag, right_tag):
                        # The element opens and closes within this block.
                        if self.markdown_in_raw and 'markdown' in attrs.keys():
                            # Stash the opening tag (with its `markdown`
                            # attribute removed) and the closing tag
                            # separately; the inner content stays in the
                            # output between them.
                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                           '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete

                        # NOTE(review): `and` binds tighter than `or`, so this
                        # evaluates as `isBlockLevel(left_tag) or (left_tag ==
                        # "--" and not ...endswith(">"))`.  The comment above
                        # suggests the "not complete" check may have been
                        # meant to apply to both cases -- confirm.
                        if util.isBlockLevel(left_tag) or left_tag == "--" \
                            and not block.rstrip().endswith(">"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))

                        continue

                new_blocks.append(block)

            else:
                # Inside an open element: keep collecting blocks until one of
                # them contains the closing tag.
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    if self.markdown_in_raw and 'markdown' in attrs.keys():
                        # Same split as in the single-block case: stash the
                        # opening and closing tags, keep the content between.
                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                       '', items[0][:left_index])
                        items[0] = items[0][left_index:]
                        end = items[-1][-len(right_tag)-2:]
                        items[-1] = items[-1][:-len(right_tag)-2]
                        new_blocks.append(
                            self.markdown.htmlStash.store(start))
                        new_blocks.extend(items)
                        new_blocks.append(
                            self.markdown.htmlStash.store(end))
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        # The input ended while an element was still open: flush whatever was
        # collected for it.
        if items:
            if self.markdown_in_raw and 'markdown' in attrs.keys():
                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                               '', items[0][:left_index])
                items[0] = items[0][left_index:]
                end = items[-1][-len(right_tag)-2:]
                items[-1] = items[-1][:-len(right_tag)-2]
                new_blocks.append(
                    self.markdown.htmlStash.store(start))
                new_blocks.extend(items)
                if end.strip():
                    new_blocks.append(
                        self.markdown.htmlStash.store(end))
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
| 302 |
| 303 |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # An optional title in double quotes, single quotes or parentheses.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # A reference definition line: `[id]: link "optional title"`.
    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    # A line holding nothing but a title (used for titles on the next line).
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of `lines` and record them.

        Each line matching `RE` is removed and stored in
        `self.markdown.references` as `id -> (link, title)`, where the id is
        stripped and lowercased and the link has any surrounding angle
        brackets removed.  When the definition line carries no title, the
        following line is consumed as the title if it matches `TITLE_RE`.

        Returns a new list with all remaining lines in their original order.
        The `lines` list passed in is read but not modified.

        """
        new_text = []
        total = len(lines)
        index = 0
        # Walk the list by index rather than with `lines.pop(0)`, which is
        # O(n) per call and would empty the caller's list.
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title -- but only if there is a next line;
            # a definition on the very last line must not raise IndexError.
            if not title and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
OLD | NEW |