| OLD | NEW |
| (Empty) |
| 1 """ | |
| 2 PRE-PROCESSORS | |
| 3 ============================================================================= | |
| 4 | |
| 5 Preprocessors work on source text before we start doing anything too | |
| 6 complicated. | |
| 7 """ | |
| 8 | |
| 9 from __future__ import absolute_import | |
| 10 from __future__ import unicode_literals | |
| 11 from . import util | |
| 12 from . import odict | |
| 13 import re | |
| 14 | |
| 15 | |
def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered collection of preprocessors for Markdown.

    The whitespace normalizer always comes first and the reference
    preprocessor always comes last. The HTML block preprocessor sits between
    them unless `md_instance.safeMode` is 'escape', in which case it is left
    out. Extra keyword arguments are accepted but not used here.
    """
    registry = odict.OrderedDict()

    # (name, class) pairs in the order the preprocessors are registered.
    stages = [('normalize_whitespace', NormalizeWhitespace)]
    if md_instance.safeMode != 'escape':
        stages.append(("html_block", HtmlBlockPreprocessor))
    stages.append(("reference", ReferencePreprocessor))

    for key, factory in stages:
        registry[key] = factory(md_instance)
    return registry
| 24 | |
| 25 | |
class Preprocessor(util.Processor):
    """
    Base class for all preprocessors.

    Preprocessors are run once the source text has been broken into lines.
    A subclass supplies a "run" method which receives the list of lines of
    the document, changes it as needed, and hands back either that same list
    or a new one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Process the document; every subclass is expected to override this.

        `lines` is the document as a list of strings split by newlines, and
        the return value should be the (possibly modified) list of lines.
        This base implementation does nothing and returns None.

        """
        return None  # pragma: no cover
| 45 | |
| 46 | |
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return the document's lines with whitespace brought to one form.

        The lines are joined, every `util.STX` / `util.ETX` character is
        removed, all line endings become "\\n", two newlines are appended,
        tabs are expanded to `self.markdown.tab_length` columns, and lines
        that consist only of spaces (and follow a newline) are emptied.
        The result is split on "\\n" again.
        """
        text = '\n'.join(lines)

        # Strip the marker characters from the input.
        # NOTE(review): presumably these are reserved as placeholder
        # delimiters elsewhere in the package -- confirm in util.
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")

        # Unify Windows and old-Mac line endings, then pad the end of the
        # document with a blank line.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"

        text = text.expandtabs(self.markdown.tab_length)

        # A run of spaces that fills a whole line (preceded by a newline)
        # collapses to an empty line.
        space_only_line = r'(?<=\n) +\n'
        return re.sub(space_only_line, '\n', text).split('\n')
| 57 | |
| 58 | |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates for candidate closing tags, filled in with the left tag name.
    # For a tag "div" they give "</div>" and "div>"; for the comment
    # pseudo-tag "--" (see `run`) the second template gives "-->".
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose regex matching ONE attribute in any of three spellings. It is
    # compiled with re.VERBOSE, so the layout whitespace and the trailing
    # `#` comments inside the string are not part of the pattern.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
        | # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value
        | # OR
        \s+(?P<attr2>[^>"'/= ]+) # attr
        """
    # Opening tag anchored at the start of a block: "<", the tag name, any
    # number of attributes, optional whitespace, optional "/", optional ">".
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
        attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, elements carrying a `markdown` attribute are recorded with
    # `htmlStash.store_tag` and their content is kept in the output instead
    # of being stashed as raw HTML (see `run`).
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a 3-tuple `(tag, length, attrs)`: the tag name, the number
        of characters the opening tag occupies, and a dict of its
        attributes (valueless or empty attributes map to "").

        If the opening-tag regex does not match, the tag is taken to be the
        lowercased text between the first character and the first ">", and
        no attributes are reported. Note the regex branch does not
        lowercase the tag name.
        """
        m = self.left_tag_re.match(block)
        if m:
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                # Re-scan the attribute text one attribute at a time; each
                # match populates exactly one of the three alternatives.
                for ma in self.attrs_re.finditer(raw_attrs):
                    if ma.group('attr'):
                        # attr="value" / attr='value'
                        if ma.group('value'):
                            attrs[ma.group('attr').strip()] = ma.group('value')
                        else:
                            attrs[ma.group('attr').strip()] = ""
                    elif ma.group('attr1'):
                        # attr=value (unquoted)
                        if ma.group('value1'):
                            attrs[ma.group('attr1').strip()] = ma.group(
                                'value1'
                            )
                        else:
                            attrs[ma.group('attr1').strip()] = ""
                    elif ma.group('attr2'):
                        # bare attr without a value
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs
        else:
            tag = block[1:].split(">", 1)[0].lower()
            # +2 accounts for the surrounding "<" and ">".
            return tag, len(tag)+2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the end of the `rtag` that balances an already-opened `ltag`.

        Searches `block` from `start_index`. Each further `ltag` found
        before the next `rtag` is treated as a nested element and is
        skipped, together with its own closing `rtag`, by recursion.

        Returns the index just past the balancing `rtag`, or -1 if there is
        none.
        """
        while 1:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            # The recursive call returns the index just past the nested
            # element; the loop then resumes the outer search from there.
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for `left_tag` in `block`.

        Each template in `right_tag_patterns` is tried in turn, searching
        from `left_index`. On success returns `(name, end)` where `name` is
        the closing tag without its leading "<" and trailing ">" characters
        (e.g. "/div", "--") and `end` is the index just past the closing
        tag.

        If no template is found, returns a guess built from the tail of the
        block (its last `left_index` characters minus the final one,
        lowercased) together with `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            # -1 (not found) fails this test.
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True if `right_tag` is considered to close `left_tag`.

        Left tags starting with "?", "@" or "%" always count as closed. A
        right tag equal to "/" + left tag matches, as does the comment pair
        "--" / "--". Everything else returns False.
        """
        if left_tag[0] in ['?', '@', '%']:  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if (right_tag == "--" and left_tag == "--"):
            return True
        elif left_tag == right_tag[1:] and right_tag[0] == "/":
            return True
        else:
            return False

    def _is_oneliner(self, tag):
        """Return True for tags treated as complete on their own (hr)."""
        return (tag in ['hr', 'hr/'])

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        Note: `items` is modified -- a 'dummy' entry is appended so that the
        scan can run one item past the real data. The caller in this class
        passes a slice copy.
        """
        items.append('dummy')
        i, count = 0, 0
        # Accumulate item lengths until the running total passes stringindex.
        while count <= stringindex:
            count += len(items[i])
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """
        Find and process html child elements of the given element block.

        `items` is edited in place (placeholders are inserted and raw HTML
        spans are removed while it is being enumerated) and the same list
        is returned.
        """
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                # Parse the child element against everything from this item
                # onwards, concatenated without separators.
                left_tag, left_index, attrs = \
                    self._get_left_tag(''.join(items[i:]))
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items[i:]))
                # Map the closing-tag position back to an index into items.
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs.keys():
                    items[i] = items[i][left_index:]  # remove opening tag
                    # NOTE(review): the meaning of the two index arguments
                    # is defined by htmlStash.store_tag, which is not
                    # visible here -- confirm before changing the offsets.
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    # len(right_tag) + 2 is the width of the closing tag
                    # including its "<" and ">".
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    # When the element starts and ends in the same item the
                    # slice below would be empty, so widen it by one.
                    offset = 1 if i == right_listindex else 0
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex + offset]))
                    del items[i:right_listindex + offset]
                    items.insert(i, placeholder)
        return items

    def run(self, lines):
        """
        Replace raw HTML blocks in the document with stash placeholders.

        The lines are joined and split on blank lines into blocks. Blocks
        that open a block-level element, a comment, or a "!", "?", "@" or
        "%" construct are handed to `self.markdown.htmlStash` and replaced
        by the value it returns. An element that is not closed within its
        first block is collected across the following blocks until its
        closing tag is found or the input ends. All other blocks pass
        through unchanged.

        Returns the resulting document as a list of lines.
        """
        text = "\n".join(lines)
        new_blocks = []
        # rsplit without a maxsplit yields the same pieces as split.
        text = text.rsplit("\n\n")
        # Blocks collected so far for an element that is still open.
        items = []
        left_tag = ''
        right_tag = ''
        in_tag = False  # flag

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    # Text following the closing tag is pushed back onto the
                    # queue so it is examined as a block of its own.
                    if data_index < len(block) and (util.isBlockLevel(left_tag) or left_tag == '--'):
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    # Neither block-level nor a "<!", "<?", "<@", "<%"
                    # construct: leave the block untouched.
                    if not (util.isBlockLevel(left_tag) or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    # <hr> is kept in the output as-is rather than stashed.
                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        # The element opens and closes within this block.
                        if self.markdown_in_raw and 'markdown' in attrs.keys():
                            # Strip the opening and closing tags, record the
                            # tag in the stash, and keep the inner text.
                            block = block[left_index:-len(right_tag) - 2]
                            new_blocks.append(self.markdown.htmlStash.
                                              store_tag(left_tag, attrs, 0, 2))
                            new_blocks.extend([block])
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        if (not self._equal_tags(left_tag, right_tag)) and \
                           (util.isBlockLevel(left_tag) or left_tag == "--"):
                            # Start collecting; left_tag, left_index and
                            # attrs are kept for the following iterations.
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip())
                            )
                        continue

                else:
                    # Not an HTML block at all.
                    new_blocks.append(block)

            else:
                # Inside an open element: collect the block and look for
                # the closing tag anywhere in it.
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    if self.markdown_in_raw and 'markdown' in attrs.keys():
                        # Remove the outer opening and closing tags.
                        items[0] = items[0][left_index:]
                        items[-1] = items[-1][:-len(right_tag) - 2]
                        # NOTE(review): right_index and the later
                        # `nests - 2` adjustment are bookkeeping for
                        # htmlStash.tag_data; their exact meaning is defined
                        # outside this file -- confirm before changing.
                        if items[len(items) - 1]:  # not a newline/empty string
                            right_index = len(items) + 3
                        else:
                            right_index = len(items) + 2
                        new_blocks.append(self.markdown.htmlStash.store_tag(
                            left_tag, attrs, 0, right_index))
                        placeholderslen = len(self.markdown.htmlStash.tag_data)
                        new_blocks.extend(
                            self._nested_markdown_in_html(items))
                        # Number of tag_data entries added while processing
                        # the nested elements.
                        nests = len(self.markdown.htmlStash.tag_data) - \
                            placeholderslen
                        self.markdown.htmlStash.tag_data[-1 - nests][
                            'right_index'] += nests - 2
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        # Input ended while an element was still open: handle whatever was
        # collected the same way as a closed element.
        if items:
            if self.markdown_in_raw and 'markdown' in attrs.keys():
                items[0] = items[0][left_index:]
                items[-1] = items[-1][:-len(right_tag) - 2]
                if items[len(items) - 1]:  # not a newline/empty string
                    right_index = len(items) + 3
                else:
                    right_index = len(items) + 2
                new_blocks.append(
                    self.markdown.htmlStash.store_tag(
                        left_tag, attrs, 0, right_index))
                placeholderslen = len(self.markdown.htmlStash.tag_data)
                new_blocks.extend(self._nested_markdown_in_html(items))
                nests = len(self.markdown.htmlStash.tag_data) - placeholderslen
                self.markdown.htmlStash.tag_data[-1 - nests][
                    'right_index'] += nests - 2
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
| 315 | |
| 316 | |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional link title in one of three forms -- "title", 'title' or
    # (title) -- with optional spaces on either side.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # A whole definition line: up to three leading spaces, "[id]:", the
    # link, and an optional title. Groups: 1 = id, 2 = link, 5/6/7 = title
    # text for the double-quoted / single-quoted / parenthesised form.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # A line holding nothing but a title (groups 2/3/4 = title text).
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of `lines` and record them.

        Each line matching `RE` is removed and stored in
        `self.markdown.references` under its lowercased, stripped id as a
        `(link, title)` tuple. A single leading "<" run and trailing ">"
        run are stripped from the link. If the definition line carries no
        title, the following line is consumed as the title when it matches
        `TITLE_RE`.

        Returns a new list containing the remaining lines, in order. The
        input list is read by index and is not modified.
        """
        new_text = []
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Only peek at the next line when there is one: a definition on
            # the very last line must not raise IndexError.
            if not title and index < total:
                # Check next line for title
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
| OLD | NEW |