OLD | NEW |
| (Empty) |
# markdown/serializers.py
#
# Add x/html serialization to ElementTree
4 # Taken from ElementTree 1.3 preview with slight modifications | |
5 # | |
6 # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. | |
7 # | |
8 # fredrik@pythonware.com | |
9 # http://www.pythonware.com | |
10 # | |
11 # -------------------------------------------------------------------- | |
12 # The ElementTree toolkit is | |
13 # | |
14 # Copyright (c) 1999-2007 by Fredrik Lundh | |
15 # | |
16 # By obtaining, using, and/or copying this software and/or its | |
17 # associated documentation, you agree that you have read, understood, | |
18 # and will comply with the following terms and conditions: | |
19 # | |
20 # Permission to use, copy, modify, and distribute this software and | |
21 # its associated documentation for any purpose and without fee is | |
22 # hereby granted, provided that the above copyright notice appears in | |
23 # all copies, and that both that copyright notice and this permission | |
24 # notice appear in supporting documentation, and that the name of | |
25 # Secret Labs AB or the author not be used in advertising or publicity | |
26 # pertaining to distribution of the software without specific, written | |
27 # prior permission. | |
28 # | |
29 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD | |
30 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- | |
31 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR | |
32 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY | |
33 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | |
34 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS | |
35 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE | |
36 # OF THIS SOFTWARE. | |
37 # -------------------------------------------------------------------- | |
38 | |
39 | |
40 from __future__ import absolute_import | |
41 from __future__ import unicode_literals | |
42 from . import util | |
# Module-level aliases for the etree names this serializer uses, all taken
# from the etree implementation exposed as ``util.etree``.
ElementTree = util.etree.ElementTree
QName = util.etree.QName
# Use ``test_comment`` in place of ``Comment`` when the etree module
# exposes one; otherwise fall back to the regular ``Comment`` factory.
if hasattr(util.etree, 'test_comment'):
    Comment = util.etree.test_comment
else:
    Comment = util.etree.Comment
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction

# Names exported by ``from <this module> import *``.
__all__ = ['to_html_string', 'to_xhtml_string']
53 | |
# Tags serialized without a closing tag (and as ``<tag />`` in XHTML).
# NOTE: "meta" and "param" must be separate entries; without the comma the
# two adjacent string literals are concatenated into "metaparam".
HTML_EMPTY = set((
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
))
61 | |
# Maps namespace URIs to preferred prefixes. ``_namespaces`` consults this
# table when a URI has no prefix assigned yet, before inventing an
# ``ns<N>`` prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # XML Schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # Dublin Core
    "http://purl.org/dc/elements/1.1/": "dc",
}
74 | |
75 | |
76 def _raise_serialization_error(text): | |
77 raise TypeError( | |
78 "cannot serialize %r (type %s)" % (text, type(text).__name__) | |
79 ) | |
80 | |
81 def _encode(text, encoding): | |
82 try: | |
83 return text.encode(encoding, "xmlcharrefreplace") | |
84 except (TypeError, AttributeError): | |
85 _raise_serialization_error(text) | |
86 | |
87 def _escape_cdata(text): | |
88 # escape character data | |
89 try: | |
90 # it's worth avoiding do-nothing calls for strings that are | |
91 # shorter than 500 character, or so. assume that's, by far, | |
92 # the most common case in most applications. | |
93 if "&" in text: | |
94 text = text.replace("&", "&") | |
95 if "<" in text: | |
96 text = text.replace("<", "<") | |
97 if ">" in text: | |
98 text = text.replace(">", ">") | |
99 return text | |
100 except (TypeError, AttributeError): | |
101 _raise_serialization_error(text) | |
102 | |
103 | |
104 def _escape_attrib(text): | |
105 # escape attribute value | |
106 try: | |
107 if "&" in text: | |
108 text = text.replace("&", "&") | |
109 if "<" in text: | |
110 text = text.replace("<", "<") | |
111 if ">" in text: | |
112 text = text.replace(">", ">") | |
113 if "\"" in text: | |
114 text = text.replace("\"", """) | |
115 if "\n" in text: | |
116 text = text.replace("\n", " ") | |
117 return text | |
118 except (TypeError, AttributeError): | |
119 _raise_serialization_error(text) | |
120 | |
121 def _escape_attrib_html(text): | |
122 # escape attribute value | |
123 try: | |
124 if "&" in text: | |
125 text = text.replace("&", "&") | |
126 if "<" in text: | |
127 text = text.replace("<", "<") | |
128 if ">" in text: | |
129 text = text.replace(">", ">") | |
130 if "\"" in text: | |
131 text = text.replace("\"", """) | |
132 return text | |
133 except (TypeError, AttributeError): | |
134 _raise_serialization_error(text) | |
135 | |
136 | |
def _serialize_html(write, elem, qnames, namespaces, format):
    """Recursively serialize ``elem`` and its subtree as HTML or XHTML.

    Args:
        write: callable accepting one string; it receives every output
            fragment in document order.
        elem: the element to serialize.
        qnames: mapping from tag/attribute names to their serialized
            form, as returned by ``_namespaces``.
        namespaces: mapping of namespace URI to prefix to declare as
            ``xmlns`` attributes on this element, or ``None``. Recursive
            calls always pass ``None``, so declarations are written only
            on the element this function was first called with.
        format: ``"html"`` or ``"xhtml"``. In ``"html"`` mode an attribute
            whose value equals its name is written minimized (``checked``
            rather than ``checked="checked"``); in ``"xhtml"`` mode tags
            in ``HTML_EMPTY`` are written as ``<tag />``.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # Tag-less element: emit only its text and children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                # sorted() instead of list.sort(): on Python 3 items()
                # returns a view object that has no sort() method.
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    if qnames[k] == v and format == 'html':
                        # handle boolean attributes
                        write(" %s" % v)
                    else:
                        write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # sort on prefix
                    ns_items = sorted(namespaces.items(), key=lambda x: x[1])
                    for v, k in ns_items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
            if format == "xhtml" and tag in HTML_EMPTY:
                write(" />")
            else:
                write(">")
                tag = tag.lower()
                if text:
                    # script/style content is written verbatim, unescaped
                    if tag == "script" or tag == "style":
                        write(text)
                    else:
                        write(_escape_cdata(text))
                for e in elem:
                    _serialize_html(write, e, qnames, None, format)
                if tag not in HTML_EMPTY:
                    write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
191 | |
def _write_html(root,
                encoding=None,
                default_namespace=None,
                format="html"):
    """Serialize the tree rooted at ``root`` and return the result.

    Args:
        root: root element of the tree to serialize; must not be None.
        encoding: when None (the default) the text string is returned
            as-is; otherwise the output is encoded with this encoding via
            ``_encode``.
        default_namespace: namespace URI to use as the default
            (un-prefixed) namespace, passed through to ``_namespaces``.
        format: ``"html"`` or ``"xhtml"``, passed to ``_serialize_html``.

    Returns:
        The serialized document, as text or as encoded bytes.
    """
    assert root is not None
    data = []
    write = data.append
    qnames, namespaces = _namespaces(root, default_namespace)
    _serialize_html(write, root, qnames, namespaces, format)
    if encoding is None:
        return "".join(data)
    else:
        # _encode takes the target encoding as its second argument.
        return _encode("".join(data), encoding)
205 | |
206 | |
207 # -------------------------------------------------------------------- | |
208 # serialization support | |
209 | |
def _namespaces(elem, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Walks ``elem`` and all of its descendants, looking at tags, attribute
    names, and attribute values / text that are ``QName`` instances.

    Args:
        elem: root element of the tree to scan.
        default_namespace: optional namespace URI to map to the empty
            prefix.

    Returns:
        A ``(qnames, namespaces)`` tuple. ``qnames`` maps each name found
        (in ``{uri}local`` or plain form) to its serialized
        ``prefix:local`` form; ``namespaces`` maps namespace URIs to
        prefixes.

    Raises:
        ValueError: if a non-qualified name is found while
            ``default_namespace`` is set.
        TypeError: via ``_raise_serialization_error`` for a tag or name
            that cannot be serialized.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is predefined and never declared
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # Keep the "already registered" test nested: combining it with
            # the isinstance() check would send a repeated QName tag to
            # the error branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, util.string_type):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
272 | |
def to_html_string(element):
    """Return ``element`` and its subtree serialized in "html" format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
275 | |
def to_xhtml_string(element):
    """Return ``element`` and its subtree serialized in "xhtml" format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")
OLD | NEW |