OLD | NEW |
| (Empty) |
1 """ | |
2 PRE-PROCESSORS | |
3 ============================================================================= | |
4 | |
5 Preprocessors work on source text before we start doing anything too | |
6 complicated. | |
7 """ | |
8 | |
9 from __future__ import absolute_import | |
10 from __future__ import unicode_literals | |
11 from . import util | |
12 from . import odict | |
13 import re | |
14 | |
15 | |
def build_preprocessors(md_instance, **kwargs):
    """ Assemble the default, ordered set of preprocessors for Markdown.

    Each preprocessor is constructed with ``md_instance`` and registered
    under its name in insertion order.  The ``html_block`` entry is left
    out when ``md_instance.safeMode`` equals ``'escape'``.  Additional
    keyword arguments are accepted but not used here.
    """
    defaults = (
        ('normalize_whitespace', NormalizeWhitespace),
        ("html_block", HtmlBlockPreprocessor),
        ("reference", ReferencePreprocessor),
    )
    preprocessors = odict.OrderedDict()
    for name, factory in defaults:
        # Raw HTML blocks are not extracted in 'escape' safe mode.
        if name == "html_block" and md_instance.safeMode == 'escape':
            continue
        preprocessors[name] = factory(md_instance)
    return preprocessors
24 | |
25 | |
class Preprocessor(util.Processor):
    """
    Base class for preprocessors, which run once the source text has been
    broken into lines.

    A preprocessor supplies a "run" method that receives the list of lines
    making up the document, changes it as needed, and hands back either
    that same list or a new one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Hook that every subclass is expected to override.

        `lines` is the document as a list of strings (the text split on
        newlines); an override returns the (possibly modified) list of
        lines.  This base implementation does nothing and returns `None`.

        """
        return None
45 | |
46 | |
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return the document's lines with whitespace normalized.

        The lines are joined, every occurrence of `util.STX` and `util.ETX`
        is removed, `\\r\\n` and `\\r` line endings become `\\n`, two
        newlines are appended, tabs are expanded to
        `self.markdown.tab_length` columns, and any line consisting only of
        spaces that follows a newline is emptied.  The result is split on
        `\\n` again.
        """
        text = '\n'.join(lines)
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")
        # Unify line endings before padding the end of the document.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = (text + "\n\n").expandtabs(self.markdown.tab_length)
        blank_only = r'(?<=\n) +\n'
        return re.sub(blank_only, '\n', text).split('\n')
57 | |
58 | |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Closing-tag templates, tried in this order by `_get_right_tag`.
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose-mode pattern for one attribute of an opening tag.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = (
        r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    )
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, an element whose opening tag carries a `markdown`
    # attribute has only its opening/closing tag text stashed; the text in
    # between is left in the output (see `run`).
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a `(tag, length, attrs)` tuple: the tag name, the number of
        characters the opening tag occupies, and a dict of its attributes
        (valueless attributes map to "").  If `left_tag_re` does not match,
        the tag is the lowercased text between the first character and the
        first ">", and `attrs` is empty.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):
                    # attr="value" (quoted)
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    # attr=value (unquoted)
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):
                    # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the end of the `rtag` that closes the element being scanned.

        Searches `block` from `start_index`.  Each further `ltag` found
        before the next `rtag` is treated as a nested opening whose own
        `rtag` is located recursively before the search resumes.  Returns
        the index just past the matching `rtag`, or -1 if none is found.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for `left_tag` in `block`.

        Each template in `right_tag_patterns` is tried in turn, searching
        from `left_index`.  On the first hit whose end index is greater
        than 2, returns `(tag, end_index)` where `tag` is the closing tag
        text with leading "<" and trailing ">" characters stripped.  If
        nothing is found, returns a slice of the end of the block
        (lowercased) together with `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True if `right_tag` is accepted as the closer of `left_tag`.

        That is the case when `left_tag` starts with "?", "@" or "%", when
        `right_tag` is "/" followed by `left_tag`, or when both are "--".
        """
        # Slice rather than index so an empty tag compares unequal instead
        # of raising IndexError.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        return right_tag == "--" and left_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags handled as complete on their own."""
        return tag in ('hr', 'hr/')

    def _strip_markdown_attr(self, tag_text):
        """Return `tag_text` with any ` markdown[=value]` attribute removed."""
        return re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', '', tag_text)

    def _stash_items(self, items, attrs, left_index, right_tag, new_blocks,
                     always_store_end):
        """
        Emit the collected blocks of one multi-block element.

        Normally the blocks are joined with blank lines and stashed as one
        unit.  When `markdown_in_raw` is set and the opening tag had a
        `markdown` attribute, only the opening tag text (minus that
        attribute) and the trailing `len(right_tag) + 2` characters are
        stashed; the remaining text is appended to `new_blocks` as-is.
        `always_store_end` false skips stashing a whitespace-only tail.
        Mutates `items` and `new_blocks` in place.
        """
        store = self.markdown.htmlStash.store
        if self.markdown_in_raw and 'markdown' in attrs:
            start = self._strip_markdown_attr(items[0][:left_index])
            # Order matters when `items` holds a single block: the head is
            # trimmed before the tail is measured.
            items[0] = items[0][left_index:]
            end = items[-1][-len(right_tag) - 2:]
            items[-1] = items[-1][:-len(right_tag) - 2]
            new_blocks.append(store(start))
            new_blocks.extend(items)
            if always_store_end or end.strip():
                new_blocks.append(store(end))
        else:
            new_blocks.append(store('\n\n'.join(items)))

    def run(self, lines):
        """
        Replace raw HTML blocks in `lines` with stash placeholders.

        The text is split into blocks on blank lines.  A block starting
        with "<" is parsed for its opening tag; complete block-level
        elements (per `util.isBlockLevel`), blocks whose second character
        is one of "!", "?", "@", "%", and elements spanning several blocks
        are passed to `self.markdown.htmlStash.store` and replaced by the
        value it returns.  All other blocks pass through unchanged.
        Returns the resulting text as a list of lines.
        """
        text = "\n".join(lines)
        new_blocks = []
        # NOTE: rsplit is intentional.  With runs of three or more newlines
        # it leaves the odd newline on the preceding block, whereas split
        # would leave it on the following one.
        pending = text.rsplit("\n\n")
        # Kept reversed so taking the next block and pushing back a
        # remainder are both O(1) operations on the list's tail.
        pending.reverse()
        items = []
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # True while collecting a multi-block element

        while pending:
            block = pending.pop()
            # Drop up to two leading newlines.
            for _ in range(2):
                if block.startswith("\n"):
                    block = block[1:]

            if in_tag:
                items.append(block)
                right_tag, data_index = self._get_right_tag(
                    left_tag, 0, block)
                if self._equal_tags(left_tag, right_tag):
                    # found the closing tag
                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        pending.append(block[data_index:])
                    in_tag = False
                    self._stash_items(items, attrs, left_index, right_tag,
                                      new_blocks, True)
                    items = []
                continue

            if not (block.startswith("<") and len(block.strip()) > 1):
                new_blocks.append(block)
                continue

            if block[1] == "!":
                # is a comment block
                left_tag, left_index, attrs = "--", 2, {}
            else:
                left_tag, left_index, attrs = self._get_left_tag(block)
            right_tag, data_index = self._get_right_tag(
                left_tag, left_index, block)
            block_level = util.isBlockLevel(left_tag)

            if data_index < len(block) and (block_level
                                            or left_tag == '--'):
                # Text follows the closing tag: handle it as its own block.
                pending.append(block[data_index:])
                block = block[:data_index]

            if not (block_level or block[1] in ["!", "?", "@", "%"]):
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            closed = block.rstrip().endswith(">")
            if closed and self._equal_tags(left_tag, right_tag):
                if self.markdown_in_raw and 'markdown' in attrs:
                    start = self._strip_markdown_attr(block[:left_index])
                    end = block[-len(right_tag) - 2:]
                    block = block[left_index:-len(right_tag) - 2]
                    new_blocks.append(self.markdown.htmlStash.store(start))
                    new_blocks.append(block)
                    new_blocks.append(self.markdown.htmlStash.store(end))
                else:
                    new_blocks.append(
                        self.markdown.htmlStash.store(block.strip()))
            elif block_level or (left_tag == "--" and not closed):
                # Element is not complete in this block: start collecting.
                # The parentheses spell out the operator precedence this
                # condition has always had; a block-level tag enters this
                # branch whether or not the block ends with ">".
                # NOTE(review): the ">" check applies only to "--" tags --
                # confirm that is intended.
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))

        if items:
            # The document ended inside an unclosed element.
            self._stash_items(items, attrs, left_index, right_tag,
                              new_blocks, False)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
270 | |
271 | |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "double quoted", 'single quoted' or (parenthesised).
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # `[id]: link` with up to three leading spaces and an optional title.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # A line holding nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of `lines`.

        Every line matching `RE` is removed and recorded in
        `self.markdown.references` under its stripped, lowercased id as a
        `(link, title)` tuple; one leading "<" run and trailing ">" run are
        stripped from the link.  If the definition line has no title and
        the following line matches `TITLE_RE`, that line supplies the title
        and is removed as well.  The title is `None` when none is found.

        Returns a new list holding the remaining lines; `lines` itself is
        not modified.
        """
        new_text = []
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Only look ahead when a next line exists; a definition on the
            # last line would otherwise raise IndexError.
            if not title and index < total:
                # Check next line for title
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
OLD | NEW |