OLD | NEW |
(Empty) | |
| 1 # -*- coding: iso-8859-1 -*- |
| 2 """ A SAX2 driver for libxml2, on top of its XmlReader API |
| 3 |
| 4 USAGE |
| 5 # put this file (drv_libxml2.py) in PYTHONPATH |
| 6 import xml.sax |
| 7 reader = xml.sax.make_parser(["drv_libxml2"]) |
| 8 # ...and the rest is standard python sax. |
| 9 |
| 10 CAVEATS |
| 11 - Lexical handlers are supported, except for start/endEntity |
| 12 (waiting for XmlReader.ResolveEntity) and start/endDTD |
| 13 - Error callbacks are not exactly synchronous, they tend |
| 14 to be invoked before the corresponding content callback, |
| 15 because the underlying reader interface parses |
| 16 data by chunks of 512 bytes |
| 17 |
| 18 TODO |
| 19 - search for TODO |
| 20 - some ErrorHandler events (warning) |
| 21 - some ContentHandler events (setDocumentLocator, skippedEntity) |
| 22 - EntityResolver (using libxml2.?) |
| 23 - DTDHandler (if/when libxml2 exposes such node types) |
| 24 - DeclHandler (if/when libxml2 exposes such node types) |
| 25 - property_xml_string? |
| 26 - feature_string_interning? |
| 27 - Incremental parser |
| 28 - additional performance tuning: |
| 29 - one might cache callbacks to avoid some name lookups |
| 30 - one might implement a smarter way to pass attributes to startElement |
| 31 (some kind of lazy evaluation?) |
| 32 - there might be room for improvement in start/endPrefixMapping |
| 33 - other? |
| 34 |
| 35 """ |
| 36 |
| 37 __author__ = "Stéphane Bidoul <sbi@skynet.be>" |
| 38 __version__ = "0.3" |
| 39 |
| 40 import sys |
| 41 import codecs |
| 42 |
# Python 2 / Python 3 compatibility: decide which types count as "a string"
# when parse() has to tell a file name apart from an input source object.
if sys.version_info[0] >= 3:
    # Python 3: str is the only text type
    StringTypes = str
else:
    # Python 2: turn the escaped byte-string author name into unicode ...
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    # ... and accept both byte strings and unicode strings
    StringTypes = (str, unicode)
| 49 |
| 50 from xml.sax._exceptions import * |
| 51 from xml.sax import xmlreader, saxutils |
| 52 from xml.sax.handler import \ |
| 53 feature_namespaces, \ |
| 54 feature_namespace_prefixes, \ |
| 55 feature_string_interning, \ |
| 56 feature_validation, \ |
| 57 feature_external_ges, \ |
| 58 feature_external_pes, \ |
| 59 property_lexical_handler, \ |
| 60 property_declaration_handler, \ |
| 61 property_dom_node, \ |
| 62 property_xml_string |
| 63 |
| 64 # libxml2 returns strings as UTF8 |
| 65 _decoder = codecs.lookup("utf8")[1] |
| 66 def _d(s): |
| 67 if s is None: |
| 68 return s |
| 69 else: |
| 70 return _decoder(s)[0] |
| 71 |
| 72 try: |
| 73 import libxml2 |
| 74 except ImportError: |
| 75 raise SAXReaderNotAvailable("libxml2 not available: " \ |
| 76 "import error was: %s" % sys.exc_info()[1]) |
| 77 |
class Locator(xmlreader.Locator):
    """Present a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # the wrapped libxml2 locator; line and system id queries go to it
        self.__locator = locator

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        return self.__locator.LineNumber()

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        Always -1: no column is obtained from the wrapped locator.
        """
        return -1

    def getSystemId(self):
        """Return the system identifier for the current event."""
        return self.__locator.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None
| 99 |
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader driver built on the libxml2 XmlTextReader API.

    Features that can be queried and set: namespaces, namespace prefixes,
    validation, external parameter entities; external general entities are
    reported as always on and cannot be switched off.  The only settable
    property is the lexical handler.  DTD handlers, entity resolvers and
    declaration handlers are rejected with SAXNotSupportedException.
    """

    def __init__(self):
        """Create a reader with namespaces and validation switched off."""
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag (non-zero while parse() is running)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) tuples filled by _errorHandler
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Error callback registered on the libxml2 reader.

        Errors are only queued here; parse() forwards them to the SAX
        error handler after the Read() call that produced them.

        arg      -- user argument given to SetErrorHandler (None, unused)
        msg      -- error message
        severity -- libxml2 severity code
        locator  -- libxml2 locator, wrapped in a SAX Locator
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the queued errors to the SAX error handler.

        fatal -- true when the parse is about to stop; the last queued
                 non-warning error is then reported through fatalError().
        """
        # Detach the queue before dispatching: handler callbacks may raise
        # (the default xml.sax ErrorHandler raises from error() and
        # fatalError()), and errors that were already handed over must not
        # be reported again by a later parse() on this reader.
        pending, self.__errors = self.__errors, None
        if not pending:
            return
        warningLevels = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                         libxml2.PARSER_SEVERITY_WARNING)
        # when fatal is set, the parse will stop;
        # we consider that the last error reported
        # is the fatal one.
        lastException = pending[-1][1]
        for severity, exception in pending:
            if severity in warningLevels:
                self._err_handler.warning(exception)
            elif fatal and exception is lastException:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def _lexicalEntityEvent(self, eventName, entityName):
        """Send startEntity/endEntity to the lexical handler, if possible.

        Nothing happens when no lexical handler is set or when the handler
        does not implement the requested method.
        """
        callback = getattr(self.__lex_handler, eventName, None)
        if callback is not None:
            callback(entityName)

    def parse(self, source):
        """Parse *source* and fire the SAX events on the registered handlers.

        source -- a string (passed to libxml2 as a file name), or anything
                  accepted by saxutils.prepare_input_source.

        Raises SAXException when the reader yields an unexpected node type;
        exceptions raised by the handlers propagate to the caller.  The
        libxml2 reader is closed and the parsing flag is reset in every
        case.
        """
        self.__parsing = 1
        # drop anything left over from a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # keep the buffer referenced by a local for the whole parse
                byteBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = byteBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # only "xmlns" and "xmlns:prefix" declare a
                            # namespace; other names that merely begin
                            # with "xmlns" are ordinary attributes
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if qname == "xmlns":
                                    newPrefix = None
                                else:
                                    newPrefix = qname[6:]
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()),
                             _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        self._lexicalEntityEvent("startEntity",
                                                 _d(reader.Name()))
                    # NOTE(review): the module docstring says entity events
                    # are "waiting for XmlReader.ResolveEntity" -- confirm
                    # that the libxml2 binding in use provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self._lexicalEntityEvent("endEntity",
                                                 _d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    pass  # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Not handled on purpose: Attribute (2, never in this loop),
                # Document (9, not exposed), DocumentFragment (11, never
                # returned by XmlReader) and None (0).
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # release the reader even when a handler or the loop raised
            try:
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to switch external general entities off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler is
        rejected with SAXNotSupportedException, any other property with
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
| 374 |
def create_parser():
    """Return a new LibXml2Reader instance.

    This is the factory looked up by xml.sax.make_parser when this module
    is named as a driver (see USAGE in the module docstring).
    """
    driver = LibXml2Reader()
    return driver
| 377 |
OLD | NEW |