| Index: third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/treebuilders/etree_lxml.py
|
| diff --git a/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/treebuilders/etree_lxml.py b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/treebuilders/etree_lxml.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..35d08efaa6145719f8daad533d03df6188e2d2e4
|
| --- /dev/null
|
| +++ b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/treebuilders/etree_lxml.py
|
| @@ -0,0 +1,369 @@
|
| +"""Module for supporting the lxml.etree library. The idea here is to use as much
|
| +of the native library as possible, without using fragile hacks like custom element
|
| +names that break between releases. The downside of this is that we cannot represent
|
| +all possible trees; specifically the following are known to cause problems:
|
| +
|
| +Text or comments as siblings of the root element
|
| +Doctypes with no name
|
| +
|
| +When any of these things occur, we emit a DataLossWarning
|
| +"""
|
| +
|
| +from __future__ import absolute_import, division, unicode_literals
|
| +
|
| +import warnings
|
| +import re
|
| +import sys
|
| +
|
| +from . import _base
|
| +from ..constants import DataLossWarning
|
| +from .. import constants
|
| +from . import etree as etree_builders
|
| +from .. import ihatexml
|
| +
|
| +import lxml.etree as etree
|
| +
|
| +
|
# Module-level switch read by TreeBuilder.getDocument(): when true the whole
# ElementTree is returned, otherwise only its root element.
fullTree = True

# Splits a "{namespace}local" name into its namespace and local parts.
tag_regexp = re.compile(r"{([^}]*)}(.*)")

# The .tag value lxml assigns to comment nodes; the serializers compare
# element.tag against it to recognise comments.
comment_type = etree.Comment("asd").tag
|
| +
|
| +
|
class DocumentType(object):
    """Plain record holding the three parts of a doctype declaration."""

    def __init__(self, name, publicId, systemId):
        """Store the doctype name and its public/system identifiers unchanged."""
        self.name, self.publicId, self.systemId = name, publicId, systemId
|
| +
|
| +
|
class Document(object):
    """Document node wrapping an lxml ElementTree.

    ``_elementTree`` starts out as None and must be assigned before
    ``appendChild`` is used; ``_childNodes`` holds the wrapped child nodes.
    """

    def __init__(self):
        self._childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Add the lxml node wrapped by *element* as the next sibling of the root."""
        root = self._elementTree.getroot()
        root.addnext(element._element)

    def _getChildNodes(self):
        """Return the list of wrapped child nodes (the live list, not a copy)."""
        return self._childNodes

    childNodes = property(fget=_getChildNodes)
|
| +
|
| +
|
def testSerializer(element):
    """Render *element* as the indented, ``|``-prefixed text tree used for testing.

    Accepts an lxml ElementTree (emitted under a ``#document`` header,
    including its doctype and every sibling of the root), a text string, an
    iterable fragment of nodes/strings (emitted under ``#document-fragment``),
    a comment node or an ordinary element.

    Returns the serialized lines joined with newlines.
    """
    rv = []
    infosetFilter = ihatexml.InfosetFilter()

    def serializeElement(element, indent=0):
        pad = ' ' * indent
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                docinfo = element.docinfo
                if docinfo.internalDTD:
                    if not (docinfo.public_id or docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            docinfo.root_name,
                            docinfo.public_id,
                            docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                # Rewind to the first top-level node so that nodes placed
                # before the root element are serialized too.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, (str, bytes)):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info.major == 2
                rv.append("|%s\"%s\"" % (pad, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (pad, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (pad, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (pad, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (pad,
                                       infosetFilter.fromXmlName(element.tag)))

            child_pad = ' ' * (indent + 2)
            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                # Attributes are listed in sorted order for stable output.
                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (child_pad, name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (child_pad, element.text))
            for child in element:
                serializeElement(child, indent + 2)
            # Tail text belongs to the parent, so it sits at this element's
            # own indentation level.
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (pad, element.tail))

    serializeElement(element, 0)

    return "\n".join(rv)
|
| +
|
| +
|
def tostring(element):
    """Serialize an element and its child nodes to a string.

    *element* may be an lxml ElementTree (its doctype, if any, is emitted
    first and then its root element), a comment node or an ordinary element.
    Attribute values and text are written out as-is, without escaping.
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # No .tag attribute: treated as a whole tree
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join("%s=\"%s\"" % (name, value)
                                for name, value in element.attrib.items())
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)
|
| +
|
| +
|
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that constructs its output as an lxml.etree tree.

    Names, text and comment data are passed through an
    ``ihatexml.InfosetFilter`` before being handed to lxml. Constructs that
    cannot be represented are reported with ``DataLossWarning``.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        """Create the element/comment wrapper classes and initialise the builder.

        namespaceHTMLElements -- stored on the instance and forwarded to the
            base class constructor.
        fullTree -- forwarded to ``etree_builders.getETreeModule``.
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        def attribName(key):
            # Tuple keys carry the local name at index 1 and the namespace
            # at index 2; they become "{namespace}local" names.
            if isinstance(key, tuple):
                return "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
            return infosetFilter.coerceAttribute(key)

        class Attributes(dict):
            """dict that mirrors every entry into the wrapped lxml element's attrib."""

            def __init__(self, element, value=None):
                self._element = element
                # None instead of a shared mutable default argument
                dict.__init__(self, {} if value is None else value)
                for key, item in self.items():
                    self._element._element.attrib[attribName(key)] = item

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                self._element._element.attrib[attribName(key)] = value

        class Element(builder.Element):
            """Element wrapper that coerces names and text through the infoset filter."""

            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            """Comment wrapper that coerces its data through the infoset filter."""

            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        # Use the coercing subclass defined above; assigning builder.Comment
        # here left that subclass unused and comment data unfiltered.
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        """Reset builder state; comments are buffered until the root exists."""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        """Return the module-level ``testSerializer`` rendering of *element*."""
        return testSerializer(element)

    def getDocument(self):
        """Return the built ElementTree, or only its root element.

        NOTE: the choice is made by the module-level ``fullTree`` flag, not
        by the ``fullTree`` argument passed to ``__init__``.
        """
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the first open element's content as a list.

        The list holds the element's leading text (if any), its child
        elements, and its tail text (if any).
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        """Record the doctype from *token* for later use by ``insertRoot``.

        A doctype without a name is dropped and a name that needs coercing is
        replaced by its coerced form; both cases emit ``DataLossWarning``.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Buffer a comment token seen before the root element exists."""
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        """Insert a comment once the root exists, warning about a lossy case."""
        if parent == self.document:
            root = self.document._elementTree.getroot()
            # len() guard: indexing [-1] on a childless root would raise
            # IndexError before the comment could be inserted.
            if len(root) and root[-1].tag == comment_type:
                warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if "'" in sysid and '"' in sysid:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    # Quote with whichever quote character the id does not contain
                    if "'" in sysid:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        # Placeholder root element; it is renamed to the real tag below.
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments:
        for comment_token in self.initial_comments:
            root.addprevious(etree.Comment(comment_token["data"]))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain
|
|
|