OLD | NEW |
| (Empty) |
1 """ | |
2 PRE-PROCESSORS | |
3 ============================================================================= | |
4 | |
5 Preprocessors work on source text before we start doing anything too | |
6 complicated. | |
7 """ | |
8 | |
9 from __future__ import absolute_import | |
10 from __future__ import unicode_literals | |
11 from . import util | |
12 from . import odict | |
13 import re | |
14 | |
15 | |
def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered set of preprocessors for `md_instance`.

    The registry always contains `normalize_whitespace` first and `reference`
    last. The `html_block` preprocessor is registered between them unless
    `md_instance.safeMode` equals `'escape'`. Extra keyword arguments are
    accepted but not used here.
    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)

    html_is_escaped = md_instance.safeMode == 'escape'
    if not html_is_escaped:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)

    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
24 | |
25 | |
class Preprocessor(util.Processor):
    """
    Base class for processors that work on the document's list of lines.

    Preprocessors run once the source text has been broken into lines. A
    concrete preprocessor provides a `run` method which receives the list of
    lines, changes whatever it needs to, and hands back either that same
    list or a newly built one.

    Every preprocessor must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Process the document and return the resulting lines.

        Subclasses are expected to override this method. `lines` is the
        document as a list of strings split on newlines; the return value is
        the (possibly modified) list of lines.

        """
        pass  # pragma: no cover
45 | |
46 | |
class NormalizeWhitespace(Preprocessor):
    """ Bring line endings, tabs and blank lines into one consistent form. """

    def run(self, lines):
        """
        Return `lines` re-split after whitespace normalization.

        In order: the `util.STX` / `util.ETX` marker characters are removed,
        "\\r\\n" and "\\r" become "\\n", two newlines are appended, tabs are
        expanded to `self.markdown.tab_length` columns, and lines consisting
        only of spaces are emptied.
        """
        text = '\n'.join(lines)
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = (text + "\n\n").expandtabs(self.markdown.tab_length)
        blank_stripped = re.sub(r'(?<=\n) +\n', '\n', text)
        return blank_stripped.split('\n')
57 | |
58 | |
class HtmlBlockPreprocessor(Preprocessor):
    """
    Remove html blocks from the text and store them for later retrieval.

    The document is cut into blank-line separated blocks. A block that
    starts with a block-level tag (per `util.isBlockLevel`), a comment, or
    one of `<!`, `<?`, `<@`, `<%` is passed to `self.markdown.htmlStash` and
    replaced in the output by the placeholder the stash returns. Elements
    that span several blocks are accumulated until their closing tag is
    found.

    When `markdown_in_raw` is true and the opening tag carries a `markdown`
    attribute, the element's content is left in the output (after a
    `store_tag` placeholder) instead of being stashed as a whole.
    """

    # Closing-tag shapes tried in order by `_get_right_tag`:
    # "</tag>" first, then "tag>" (which matches e.g. "-->" for comments).
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
                       attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a tuple `(tag, length, attrs)`: the tag name, the number of
        characters the opening tag occupies, and a dict of its attributes
        (valueless attributes map to an empty string). If `left_tag_re` does
        not match, the text between the first character and the first ">"
        is lower-cased and used as the tag, with no attributes.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):
                    # attr="value" (an empty quoted value becomes "")
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    # attr=value
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):
                    # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the `rtag` that balances an already opened `ltag`.

        Searches `block` from `start_index`. Each further `ltag` met before
        the next `rtag` is treated as a nested element and skipped together
        with its own `rtag` via recursion. Returns the index just past the
        balancing `rtag`, or -1 when none is found.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for `left_tag` in `block`.

        Each pattern in `right_tag_patterns` is tried, searching from
        `left_index`. On success returns `(tag, index)`, where `tag` is the
        closing tag without its leading "<" / trailing ">" and `index` is
        the position just past it. If no pattern is found, returns the
        lower-cased tail of the stripped block and `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """ Return True if `right_tag` is accepted as closing `left_tag`. """
        if left_tag[0] in ['?', '@', '%']:  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if (right_tag == "--" and left_tag == "--"):
            return True
        elif left_tag == right_tag[1:] and right_tag[0] == "/":
            return True
        else:
            return False

    def _is_oneliner(self, tag):
        """ Return True for tags that never have a closing tag (`hr`). """
        return (tag in ['hr', 'hr/'])

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        A sentinel entry is used so that an index equal to the total length
        resolves to `len(items)`. The sentinel is added to a private copy,
        so the caller's list is not modified.
        """
        padded = list(items) + ['dummy']
        i, count = 0, 0
        while count <= stringindex:
            count += len(padded[i])
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """
        Find and process html child elements of the given element block.

        `items` is modified in place and returned. Child elements with a
        `markdown` attribute have their opening/closing tags removed and a
        `store_tag` placeholder inserted before them; other child elements
        are replaced as a whole by a `store` placeholder.
        """
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                left_tag, left_index, attrs = \
                    self._get_left_tag(''.join(items[i:]))
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items[i:]))
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    offset = 1 if i == right_listindex else 0
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex + offset]))
                    del items[i:right_listindex + offset]
                    items.insert(i, placeholder)
        return items

    def _stash_items(self, items, left_tag, right_tag, left_index, attrs):
        """
        Stash the accumulated `items` of one element; return output blocks.

        Without markdown-in-raw handling the items are joined with blank
        lines and stored as one raw block. Otherwise the outer tags are cut
        off, a `store_tag` placeholder is emitted, and the remaining items
        are processed for nested elements. `items` is modified in place.
        """
        stash = self.markdown.htmlStash
        if not (self.markdown_in_raw and 'markdown' in attrs):
            return [stash.store('\n\n'.join(items))]

        items[0] = items[0][left_index:]
        items[-1] = items[-1][:-len(right_tag) - 2]
        if items[-1]:  # not a newline/empty string
            right_index = len(items) + 3
        else:
            right_index = len(items) + 2
        blocks = [stash.store_tag(left_tag, attrs, 0, right_index)]
        placeholderslen = len(stash.tag_data)
        blocks.extend(self._nested_markdown_in_html(items))
        nests = len(stash.tag_data) - placeholderslen
        # NOTE(review): adjusts the outer tag's right_index by the number of
        # nested tags stored above; the "- 2" offset is kept as found.
        stash.tag_data[-1 - nests]['right_index'] += nests - 2
        return blocks

    def run(self, lines):
        """
        Replace raw HTML blocks in `lines` with stash placeholders.

        Takes and returns the document as a list of lines.
        """
        from collections import deque

        new_blocks = []
        # rsplit (not split) is intentional: on runs of three or more
        # newlines the two differ in which block receives the odd newline.
        # A deque gives O(1) removal/insertion at the front of the queue.
        text = deque("\n".join(lines).rsplit("\n\n"))
        items = []
        left_tag = ''
        right_tag = ''
        in_tag = False  # True while inside an element spanning blocks

        while text:
            block = text.popleft()
            # drop up to two leading newlines
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) and (
                            util.isBlockLevel(left_tag) or left_tag == '--'):
                        # text follows the closing tag: requeue it
                        text.appendleft(block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag) or
                            block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        if self.markdown_in_raw and 'markdown' in attrs:
                            block = block[left_index:-len(right_tag) - 2]
                            new_blocks.append(self.markdown.htmlStash.
                                              store_tag(left_tag, attrs, 0, 2))
                            new_blocks.append(block)
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        if (not self._equal_tags(left_tag, right_tag)) and \
                           (util.isBlockLevel(left_tag) or left_tag == "--"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip())
                            )
                        continue

                else:
                    new_blocks.append(block)

            else:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.appendleft(block[data_index:])

                    in_tag = False
                    new_blocks.extend(self._stash_items(
                        items, left_tag, right_tag, left_index, attrs))
                    items = []

        if items:
            # input ended inside an unclosed element: stash what we have
            new_blocks.extend(self._stash_items(
                items, left_tag, right_tag, left_index, attrs))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
315 | |
316 | |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "double quoted", 'single quoted' or (parenthesized).
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # Group 1 is the reference id, group 2 the link; groups 5-7 hold the
    # title text for the three quoting styles.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # Matches a line that consists only of a title (groups 2-4).
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of `lines`.

        Each line matching `RE` is removed and recorded in
        `self.markdown.references` as `id -> (link, title)`, with the id
        stripped and lower-cased and surrounding `<`/`>` removed from the
        link. When the definition has no title, the following line is
        consumed as the title if it matches `TITLE_RE`.

        Returns a new list with the remaining lines; `lines` itself is left
        untouched.
        """
        new_text = []
        index = 0
        total = len(lines)
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Only look ahead when a next line exists; a definition on the
            # very last line simply has no title.
            if not title and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
OLD | NEW |