Index: third_party/libxml/src/python/drv_libxml2.py |
diff --git a/third_party/libxml/src/python/drv_libxml2.py b/third_party/libxml/src/python/drv_libxml2.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..c2ec3e27fb89796e753784488c35b7c6029c5862 |
--- /dev/null |
+++ b/third_party/libxml/src/python/drv_libxml2.py |
@@ -0,0 +1,377 @@ |
+# -*- coding: iso-8859-1 -*- |
+""" A SAX2 driver for libxml2, on top of its XmlReader API |
+ |
+USAGE |
+ # put this file (drv_libxml2.py) in PYTHONPATH |
+ import xml.sax |
+ reader = xml.sax.make_parser(["drv_libxml2"]) |
+ # ...and the rest is standard python sax. |
+ |
+CAVEATS |
+ - Lexical handlers are supported, except for start/endEntity |
+ (waiting for XmlReader.ResolveEntity) and start/endDTD |
+ - Error callbacks are not exactly synchronous, they tend |
+ to be invoked before the corresponding content callback, |
+ because the underlying reader interface parses |
+ data by chunks of 512 bytes |
+ |
+TODO |
+ - search for TODO |
+ - some ErrorHandler events (warning) |
+ - some ContentHandler events (setDocumentLocator, skippedEntity) |
+ - EntityResolver (using libxml2.?) |
+ - DTDHandler (if/when libxml2 exposes such node types) |
+ - DeclHandler (if/when libxml2 exposes such node types) |
+ - property_xml_string? |
+ - feature_string_interning? |
+ - Incremental parser |
+ - additional performance tuning: |
+ - one might cache callbacks to avoid some name lookups |
+ - one might implement a smarter way to pass attributes to startElement |
+ (some kind of lazy evaluation?) |
+ - there might be room for improvement in start/endPrefixMapping |
+ - other? |
+ |
+""" |
+ |
+__author__ = "Stéphane Bidoul <sbi@skynet.be>" |
+__version__ = "0.3" |
+ |
+import sys |
+import codecs |
+ |
+if sys.version_info[0] >= 3: |
+    StringTypes = str |
+else: |
+    # Python 2: both string flavours count as strings; author becomes unicode |
+    StringTypes = (str, unicode) |
+    __author__ = codecs.unicode_escape_decode(__author__)[0] |
+ |
+from xml.sax._exceptions import * |
+from xml.sax import xmlreader, saxutils |
+from xml.sax.handler import \ |
+ feature_namespaces, \ |
+ feature_namespace_prefixes, \ |
+ feature_string_interning, \ |
+ feature_validation, \ |
+ feature_external_ges, \ |
+ feature_external_pes, \ |
+ property_lexical_handler, \ |
+ property_declaration_handler, \ |
+ property_dom_node, \ |
+ property_xml_string |
+ |
+# libxml2 hands back UTF-8 byte strings (values may already be text on Python 3) |
+_decoder = codecs.lookup("utf8")[1] |
+def _d(s): |
+    """Return `s` as text: UTF-8 bytes are decoded, None and text pass through.""" |
+    if isinstance(s, bytes): |
+        return _decoder(s)[0] |
+    return s |
+ |
+try: |
+    import libxml2 |
+except ImportError: |
+    raise SAXReaderNotAvailable( |
+        "libxml2 not available: import error was: %s" % sys.exc_info()[1]) |
+ |
+class Locator(xmlreader.Locator): |
+    """SAX Locator backed by a libxml2.xmlTextReaderLocator.""" |
+ |
+    def __init__(self, locator): |
+        self.__locator = locator |
+ |
+    def getColumnNumber(self): |
+        """Return -1: this adapter does not report column positions.""" |
+        return -1 |
+ |
+    def getLineNumber(self): |
+        """Return the line number supplied by the wrapped locator.""" |
+        return self.__locator.LineNumber() |
+ |
+    def getPublicId(self): |
+        """Return None: this adapter does not report a public identifier.""" |
+        return None |
+ |
+    def getSystemId(self): |
+        """Return the wrapped locator's base URI as the system identifier.""" |
+        return self.__locator.BaseURI() |
+ |
+class LibXml2Reader(xmlreader.XMLReader): |
+ |
+    def __init__(self): |
+        xmlreader.XMLReader.__init__(self) |
+        # SAX feature flags, read and written by getFeature()/setFeature() |
+        self.__ns = 0 |
+        self.__nspfx = 0 |
+        self.__validate = 0 |
+        self.__extparams = 1 |
+        # set to 1 for the duration of parse() |
+        self.__parsing = 0 |
+        # additional handlers, exposed through getProperty()/setProperty() |
+        self.__lex_handler = None |
+        self.__decl_handler = None |
+        # (severity, exception) pairs queued by _errorHandler(), or None |
+        self.__errors = None |
+ |
+    def _errorHandler(self, arg, msg, severity, locator): |
+        """Queue a libxml2 diagnostic as a (severity, SAXParseException) pair.""" |
+        if self.__errors is None: |
+            self.__errors = [] |
+        entry = (severity, SAXParseException(msg, None, Locator(locator))) |
+        self.__errors.append(entry) |
+ |
+    def _reportErrors(self, fatal): |
+        """Dispatch queued diagnostics to the ErrorHandler and clear the queue. |
+        If `fatal` is true, the last entry (unless a warning) goes to fatalError().""" |
+        # Detach first: a raising handler must not leave stale entries behind. |
+        errors, self.__errors = self.__errors, None |
+        warnings = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING, |
+                    libxml2.PARSER_SEVERITY_WARNING) |
+        for severity, exception in errors: |
+            if severity in warnings: |
+                self._err_handler.warning(exception) |
+            elif fatal and exception is errors[-1][1]: |
+                self._err_handler.fatalError(exception) |
+            else: |
+                self._err_handler.error(exception) |
+ |
+    def parse(self, source): |
+        """Parse an XML document and emit SAX events to the registered handlers. |
+ |
+        `source` is a file name/URL string, or anything accepted by |
+        saxutils.prepare_input_source().  Queued libxml2 diagnostics are |
+        forwarded to the ErrorHandler after each Read(); the reader is always |
+        closed and the parsing flag cleared, even if a handler raises.""" |
+        reader = None |
+        self.__errors = None # drop diagnostics left over from an aborted parse |
+        self.__parsing = 1 |
+        try: |
+            # prepare source and create reader |
+            if isinstance(source, StringTypes): |
+                reader = libxml2.newTextReaderFilename(source) |
+            else: |
+                source = saxutils.prepare_input_source(source) |
+                input = libxml2.inputBuffer(source.getByteStream()) |
+                reader = input.newTextReader(source.getSystemId()) |
+            reader.SetErrorHandler(self._errorHandler,None) |
+            # configure reader |
+            if self.__extparams: |
+                reader.SetParserProp(libxml2.PARSER_LOADDTD,1) |
+                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1) |
+                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1) |
+                reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate) |
+            else: |
+                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0) |
+            # we reuse attribute maps (for a slight performance gain) |
+            if self.__ns: |
+                attributesNSImpl = xmlreader.AttributesNSImpl({},{}) |
+            else: |
+                attributesImpl = xmlreader.AttributesImpl({}) |
+            # prefixes to pop (for endPrefixMapping) |
+            prefixes = [] |
+            # start loop |
+            self._cont_handler.startDocument() |
+            while 1: |
+                r = reader.Read() |
+                # check for errors |
+                if r == 1: |
+                    if not self.__errors is None: |
+                        self._reportErrors(0) |
+                elif r == 0: |
+                    if not self.__errors is None: |
+                        self._reportErrors(0) |
+                    break # end of parse |
+                else: |
+                    if not self.__errors is None: |
+                        self._reportErrors(1) |
+                    else: |
+                        self._err_handler.fatalError( |
+                            SAXException("Read failed (no details available)")) |
+                    break # fatal parse error |
+                # get node type |
+                nodeType = reader.NodeType() |
+                # Element |
+                if nodeType == 1: |
+                    if self.__ns: |
+                        eltName = (_d(reader.NamespaceUri()), |
+                                   _d(reader.LocalName())) |
+                        eltQName = _d(reader.Name()) |
+                        attributesNSImpl._attrs = attrs = {} |
+                        attributesNSImpl._qnames = qnames = {} |
+                        newPrefixes = [] |
+                        while reader.MoveToNextAttribute(): |
+                            qname = _d(reader.Name()) |
+                            value = _d(reader.Value()) |
+                            # only "xmlns" / "xmlns:pfx" are declarations, not e.g. "xmlnsfoo" |
+                            if qname == "xmlns" or qname.startswith("xmlns:"): |
+                                if len(qname) > 5: |
+                                    newPrefix = qname[6:] |
+                                else: |
+                                    newPrefix = None |
+                                newPrefixes.append(newPrefix) |
+                                self._cont_handler.startPrefixMapping( |
+                                    newPrefix,value) |
+                                if not self.__nspfx: |
+                                    continue # don't report xmlns attribute |
+                            attName = (_d(reader.NamespaceUri()), |
+                                       _d(reader.LocalName())) |
+                            qnames[attName] = qname |
+                            attrs[attName] = value |
+                        reader.MoveToElement() |
+                        self._cont_handler.startElementNS( |
+                            eltName,eltQName,attributesNSImpl) |
+                        if reader.IsEmptyElement(): |
+                            self._cont_handler.endElementNS(eltName,eltQName) |
+                            for newPrefix in newPrefixes: |
+                                self._cont_handler.endPrefixMapping(newPrefix) |
+                        else: |
+                            prefixes.append(newPrefixes) |
+                    else: |
+                        eltName = _d(reader.Name()) |
+                        attributesImpl._attrs = attrs = {} |
+                        while reader.MoveToNextAttribute(): |
+                            attName = _d(reader.Name()) |
+                            attrs[attName] = _d(reader.Value()) |
+                        reader.MoveToElement() |
+                        self._cont_handler.startElement( |
+                            eltName,attributesImpl) |
+                        if reader.IsEmptyElement(): |
+                            self._cont_handler.endElement(eltName) |
+                # EndElement |
+                elif nodeType == 15: |
+                    if self.__ns: |
+                        self._cont_handler.endElementNS( |
+                            (_d(reader.NamespaceUri()),_d(reader.LocalName())), |
+                            _d(reader.Name())) |
+                        for prefix in prefixes.pop(): |
+                            self._cont_handler.endPrefixMapping(prefix) |
+                    else: |
+                        self._cont_handler.endElement(_d(reader.Name())) |
+                # Text |
+                elif nodeType == 3: |
+                    self._cont_handler.characters(_d(reader.Value())) |
+                # Whitespace |
+                elif nodeType == 13: |
+                    self._cont_handler.ignorableWhitespace(_d(reader.Value())) |
+                # SignificantWhitespace |
+                elif nodeType == 14: |
+                    self._cont_handler.characters(_d(reader.Value())) |
+                # CDATA |
+                elif nodeType == 4: |
+                    if not self.__lex_handler is None: |
+                        self.__lex_handler.startCDATA() |
+                    self._cont_handler.characters(_d(reader.Value())) |
+                    if not self.__lex_handler is None: |
+                        self.__lex_handler.endCDATA() |
+                # EntityReference (NOTE(review): confirm the bindings provide ResolveEntity) |
+                elif nodeType == 5: |
+                    if not self.__lex_handler is None: |
+                        self.__lex_handler.startEntity(_d(reader.Name())) |
+                    reader.ResolveEntity() |
+                # EndEntity |
+                elif nodeType == 16: |
+                    if not self.__lex_handler is None: |
+                        self.__lex_handler.endEntity(_d(reader.Name())) |
+                # ProcessingInstruction |
+                elif nodeType == 7: |
+                    self._cont_handler.processingInstruction( |
+                        _d(reader.Name()),_d(reader.Value())) |
+                # Comment |
+                elif nodeType == 8: |
+                    if not self.__lex_handler is None: |
+                        self.__lex_handler.comment(_d(reader.Value())) |
+                # DocumentType |
+                elif nodeType == 10: |
+                    #if not self.__lex_handler is None: |
+                    #    self.__lex_handler.startDTD() |
+                    pass # TODO (how to detect endDTD? on first non-dtd event?) |
+                # XmlDeclaration |
+                elif nodeType == 17: |
+                    pass # TODO |
+                # Entity |
+                elif nodeType == 6: |
+                    pass # TODO (entity decl) |
+                # Notation (decl) |
+                elif nodeType == 12: |
+                    pass # TODO |
+                # Attribute (2), Document (9), DocumentFragment (11) and None (0) |
+                # are not expected from the read loop; like any other node type |
+                # they are rejected below. |
+                else: |
+                    raise SAXException("Unexpected node type %d" % nodeType) |
+            if r == 0: |
+                self._cont_handler.endDocument() |
+        finally: |
+            self.__parsing = 0 |
+            if reader is not None: |
+                reader.Close() |
+ |
+    def setDTDHandler(self, handler): |
+        """Always refused for now (TODO: the inherited method would work once supported).""" |
+        raise SAXNotSupportedException("DTDHandler not supported") |
+ |
+    def setEntityResolver(self, resolver): |
+        """Always refused for now (TODO: the inherited method would work once supported).""" |
+        raise SAXNotSupportedException("EntityResolver not supported") |
+ |
+    def getFeature(self, name): |
+        """Return the current state of the SAX feature `name`.""" |
+        if name == feature_namespaces: |
+            return self.__ns |
+        if name == feature_namespace_prefixes: |
+            return self.__nspfx |
+        if name == feature_validation: |
+            return self.__validate |
+        if name == feature_external_ges: |
+            return 1 # TODO (does that relate to PARSER_LOADDTD)? |
+        if name == feature_external_pes: |
+            return self.__extparams |
+        raise SAXNotRecognizedException( |
+            "Feature '%s' not recognized" % name) |
+ |
+    def setFeature(self, name, state): |
+        """Set SAX feature `name` to `state`; not allowed while parsing.""" |
+        if self.__parsing: |
+            raise SAXNotSupportedException( |
+                "Cannot set feature %s while parsing" % name) |
+        if name == feature_namespaces: |
+            self.__ns = state |
+        elif name == feature_namespace_prefixes: |
+            self.__nspfx = state |
+        elif name == feature_validation: |
+            self.__validate = state |
+        elif name == feature_external_pes: |
+            self.__extparams = state |
+        elif name != feature_external_ges: |
+            raise SAXNotRecognizedException( |
+                "Feature '%s' not recognized" % name) |
+        elif state == 0: |
+            # TODO (does that relate to PARSER_LOADDTD)? |
+            raise SAXNotSupportedException( |
+                "Feature '%s' not supported" % name) |
+ |
+    def getProperty(self, name): |
+        """Return the handler stored for the SAX property `name`.""" |
+        if name == property_lexical_handler: |
+            return self.__lex_handler |
+        if name == property_declaration_handler: |
+            return self.__decl_handler |
+        raise SAXNotRecognizedException( |
+            "Property '%s' not recognized" % name) |
+ |
+    def setProperty(self, name, value): |
+        """Set SAX property `name`; only the lexical handler can be set.""" |
+        if name == property_lexical_handler: |
+            self.__lex_handler = value |
+        elif name == property_declaration_handler: |
+            # TODO: store value in self.__decl_handler if/when libxml2 supports dtd events |
+            raise SAXNotSupportedException( |
+                "Property '%s' not supported" % name) |
+        else: |
+            raise SAXNotRecognizedException( |
+                "Property '%s' not recognized" % name) |
+ |
+def create_parser(): # factory used by xml.sax.make_parser(["drv_libxml2"]) |
+    return LibXml2Reader() |
+ |