| Index: third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py
|
| diff --git a/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..4a891ff56c4eed1293d2763981584c1dbc4ab25a
|
| --- /dev/null
|
| +++ b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py
|
| @@ -0,0 +1,320 @@
|
| +from __future__ import absolute_import, division, unicode_literals
|
| +from six import text_type
|
| +
|
| +import gettext
|
| +_ = gettext.gettext
|
| +
|
| +try:
|
| + from functools import reduce
|
| +except ImportError:
|
| + pass
|
| +
|
| +from ..constants import voidElements, booleanAttributes, spaceCharacters
|
| +from ..constants import rcdataElements, entities, xmlEntities
|
| +from .. import utils
|
| +from xml.sax.saxutils import escape
|
| +
|
| +spaceCharacters = "".join(spaceCharacters)
|
| +
|
try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    # No codec error-handler registry to hook into: encode strictly.
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    # Code point (int) -> entity name to emit when the character cannot be
    # represented in the requested output encoding.
    encode_entity_map = {}
    # True when a non-BMP character occupies a single code unit.
    is_ucs4 = len("\U0010FFFF") == 1
    for k, v in list(entities.items()):
        # skip multi-character entities (a lone surrogate pair still counts
        # as one character on builds where is_ucs4 is False)
        if len(v) > (1 if is_ucs4 else 2):
            continue
        # the ampersand itself is never mapped here
        if v == "&":
            continue
        v = utils.surrogatePairToCodepoint(v) if len(v) == 2 else ord(v)
        if k.islower() or v not in encode_entity_map:
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler that substitutes HTML character references.

        For encode/translate errors, every character in the failing range
        ``exc.object[exc.start:exc.end]`` is replaced by ``&name;`` when
        ``encode_entity_map`` knows its code point, otherwise by a
        hexadecimal numeric reference. Any other exception type is delegated
        to ``xmlcharrefreplace_errors``.

        Returns the ``(replacement, resume_position)`` tuple expected from a
        handler registered with ``codecs.register_error``.
        """
        if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            return xmlcharrefreplace_errors(exc)

        source = exc.object
        failing = source[exc.start:exc.end]

        # Pass 1: turn the failing slice into code points, folding surrogate
        # pairs into a single code point.
        codepoints = []
        offset = 0
        while offset < len(failing):
            index = exc.start + offset
            if utils.isSurrogatePair(source[index:min(exc.end, index + 2)]):
                codepoints.append(
                    utils.surrogatePairToCodepoint(source[index:index + 2]))
                offset += 2
            else:
                codepoints.append(ord(failing[offset]))
                offset += 1

        # Pass 2: render each code point as a character reference.
        pieces = []
        for cp in codepoints:
            entity_name = encode_entity_map.get(cp)
            if not entity_name:
                pieces.append("&#x%s;" % (hex(cp)[2:]))
                continue
            pieces.append("&")
            pieces.append(entity_name)
            if not entity_name.endswith(";"):
                pieces.append(";")
        return ("".join(pieces), exc.end)

    register_error(unicode_encode_errors, htmlentityreplace_errors)

    del register_error
|
| +
|
| +
|
class HTMLSerializer(object):
    """Serialize a treewalker token stream to HTML text (or encoded bytes).

    Behaviour is controlled by the class-level option attributes below; each
    of them can be overridden per instance through keyword arguments to the
    constructor (see ``__init__`` for their meaning).
    """

    # attribute quoting options
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of
          the document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot;
          &apos; are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires
          use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        # An explicit quote_char disables automatic quote selection unless
        # the caller also passes use_best_quote_char explicitly.
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False
        # serialize() overwrites this on every call; initialising it here
        # keeps encode()/encodeStrict() usable before the first serialize().
        self.encoding = None

    def encode(self, string):
        """Encode *string* with the current output encoding.

        Unencodable characters go through the module's
        ``unicode_encode_errors`` handler. When no encoding is set the text
        is returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        return string

    def encodeStrict(self, string):
        """Encode *string* with the current output encoding, strictly.

        Used for markup (tag names, attribute names, delimiters) where a
        character reference cannot stand in for the character; unencodable
        input raises ``UnicodeEncodeError``. When no encoding is set the
        text is returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        return string

    def serialize(self, treewalker, encoding=None):
        """Yield the serialized chunks for every token of *treewalker*.

        *treewalker* is an iterable of token dicts (each with a ``"type"``
        key). Chunks are text when *encoding* is falsy and bytes otherwise.
        Problems found while serializing are appended to ``self.errors``;
        with ``self.strict`` set they raise via ``serializeError``.
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from ..filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from ..filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from ..filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from ..filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        # Alphabetical attributes must be last, as other filters
        # could add attributes and alter the order
        if self.alphabetical_attributes:
            from ..filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and raw-text content are emitted verbatim.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                for (attr_namespace, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    value = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(attr_name)
                    if (self.minimize_boolean_attributes and
                            (attr_name in booleanAttributes.get(name, tuple()) or
                             attr_name in booleanAttributes.get("", tuple()))):
                        # Minimized boolean attribute: the name alone suffices.
                        continue

                    yield self.encodeStrict("=")
                    if self.quote_attr_values or not value:
                        quote_attr = True
                    else:
                        # Quoting is mandatory as soon as the value contains
                        # whitespace or one of > " ' =
                        quote_attr = any(char in value for char in
                                         spaceCharacters + ">\"'=")
                    value = value.replace("&", "&amp;")
                    if self.escape_lt_in_attrs:
                        value = value.replace("<", "&lt;")
                    if quote_attr:
                        quote_char = self.quote_char
                        if self.use_best_quote_char:
                            # Pick the quote that needs no escaping.
                            if "'" in value and '"' not in value:
                                quote_char = '"'
                            elif '"' in value and "'" not in value:
                                quote_char = "'"
                        if quote_char == "'":
                            value = value.replace("'", "&#39;")
                        else:
                            value = value.replace('"', "&quot;")
                        yield self.encodeStrict(quote_char)
                        yield self.encode(value)
                        yield self.encodeStrict(quote_char)
                    else:
                        yield self.encode(value)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    # Interpolate after the catalogue lookup so the message
                    # id stays constant.
                    self.serializeError(_("Entity %s not recognized") % name)
                # An unknown entity cannot be resolved; emit the reference
                # as written instead of failing with KeyError.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Any other token type is reported using its "data" payload.
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize *treewalker* and return the output as a single value.

        Returns bytes when *encoding* is given, text otherwise.
        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record *data* in ``self.errors``; raise SerializeError if strict."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError
|
| +
|
| +
|
class SerializeError(Exception):
    """Error in serialized tree"""
    pass
|
|
|