third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py - Issue 2635033005: Add html5lib to the list of modules in webkitpy/thirdparty/wpt/wpt/tools.

Unified Diff: third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py

Issue 2635033005: Add html5lib to the list of modules in webkitpy/thirdparty/wpt/wpt/tools. (Closed)

Patch Set: Update webkitpy/thirdparty/README.chromium. Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/__init__.py ('k') | third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/tokenizer.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py

diff --git a/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py

new file mode 100644

index 0000000000000000000000000000000000000000..4a891ff56c4eed1293d2763981584c1dbc4ab25a

--- /dev/null

+++ b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/serializer/htmlserializer.py

@@ -0,0 +1,320 @@

+from __future__ import absolute_import, division, unicode_literals

+from six import text_type

+import gettext

+_ = gettext.gettext

+try:

+ from functools import reduce

+except ImportError:

+ pass

+from ..constants import voidElements, booleanAttributes, spaceCharacters

+from ..constants import rcdataElements, entities, xmlEntities

+from .. import utils

+from xml.sax.saxutils import escape

+spaceCharacters = "".join(spaceCharacters)

+try:

+ from codecs import register_error, xmlcharrefreplace_errors

+except ImportError:

+ unicode_encode_errors = "strict"

+else:

+ unicode_encode_errors = "htmlentityreplace"

+ encode_entity_map = {}

+ is_ucs4 = len("\U0010FFFF") == 1

+ for k, v in list(entities.items()):

+ # skip multi-character entities

+ if ((is_ucs4 and len(v) > 1) or

+ (not is_ucs4 and len(v) > 2)):

+ continue

+ if v != "&":

+ if len(v) == 2:

+ v = utils.surrogatePairToCodepoint(v)

+ else:

+ v = ord(v)

+ if v not in encode_entity_map or k.islower():

+ # prefer < over &LT; and similarly for &, >, etc.

+ encode_entity_map[v] = k

+ def htmlentityreplace_errors(exc):

+ if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):

+ res = []

+ codepoints = []

+ skip = False

+ for i, c in enumerate(exc.object[exc.start:exc.end]):

+ if skip:

+ skip = False

+ continue

+ index = i + exc.start

+ if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):

+ codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])

+ skip = True

+ else:

+ codepoint = ord(c)

+ codepoints.append(codepoint)

+ for cp in codepoints:

+ e = encode_entity_map.get(cp)

+ if e:

+ res.append("&")

+ res.append(e)

+ if not e.endswith(";"):

+ res.append(";")

+ else:

+ res.append("&#x%s;" % (hex(cp)[2:]))

+ return ("".join(res), exc.end)

+ else:

+ return xmlcharrefreplace_errors(exc)

+ register_error(unicode_encode_errors, htmlentityreplace_errors)

+ del register_error

+class HTMLSerializer(object):

+ # attribute quoting options

+ quote_attr_values = False

+ quote_char = '"'

+ use_best_quote_char = True

+ # tag syntax options

+ omit_optional_tags = True

+ minimize_boolean_attributes = True

+ use_trailing_solidus = False

+ space_before_trailing_solidus = True

+ # escaping options

+ escape_lt_in_attrs = False

+ escape_rcdata = False

+ resolve_entities = True

+ # miscellaneous options

+ alphabetical_attributes = False

+ inject_meta_charset = True

+ strip_whitespace = False

+ sanitize = False

+ options = ("quote_attr_values", "quote_char", "use_best_quote_char",

+ "omit_optional_tags", "minimize_boolean_attributes",

+ "use_trailing_solidus", "space_before_trailing_solidus",

+ "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",

+ "alphabetical_attributes", "inject_meta_charset",

+ "strip_whitespace", "sanitize")

+ def __init__(self, **kwargs):

+ """Initialize HTMLSerializer.

+ Keyword options (default given first unless specified) include:

+ inject_meta_charset=True|False

+ Whether it insert a meta element to define the character set of the

+ document.

+ quote_attr_values=True|False

+ Whether to quote attribute values that don't require quoting

+ per HTML5 parsing rules.

+ quote_char=u'"'|u"'"

+ Use given quote character for attribute quoting. Default is to

+ use double quote unless attribute value contains a double quote,

+ in which case single quotes are used instead.

+ escape_lt_in_attrs=False|True

+ Whether to escape < in attribute values.

+ escape_rcdata=False|True

+ Whether to escape characters that need to be escaped within normal

+ elements within rcdata elements such as style.

+ resolve_entities=True|False

+ Whether to resolve named character entities that appear in the

+ source tree. The XML predefined entities < > & " '

+ are unaffected by this setting.

+ strip_whitespace=False|True

+ Whether to remove semantically meaningless whitespace. (This

+ compresses all whitespace to a single space except within pre.)

+ minimize_boolean_attributes=True|False

+ Shortens boolean attributes to give just the attribute value,

+ for example <input disabled="disabled"> becomes <input disabled>.

+ use_trailing_solidus=False|True

+ Includes a close-tag slash at the end of the start tag of void

+ elements (empty elements whose end tag is forbidden). E.g. <hr/>.

+ space_before_trailing_solidus=True|False

+ Places a space immediately before the closing slash in a tag

+ using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.

+ sanitize=False|True

+ Strip all unsafe or unknown constructs from output.

+ See `html5lib user documentation`_

+ omit_optional_tags=True|False

+ Omit start/end tags that are optional.

+ alphabetical_attributes=False|True

+ Reorder attributes to be in alphabetical order.

+ .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation

+ """

+ if 'quote_char' in kwargs:

+ self.use_best_quote_char = False

+ for attr in self.options:

+ setattr(self, attr, kwargs.get(attr, getattr(self, attr)))

+ self.errors = []

+ self.strict = False

+ def encode(self, string):

+ assert(isinstance(string, text_type))

+ if self.encoding:

+ return string.encode(self.encoding, unicode_encode_errors)

+ else:

+ return string

+ def encodeStrict(self, string):

+ assert(isinstance(string, text_type))

+ if self.encoding:

+ return string.encode(self.encoding, "strict")

+ else:

+ return string

+ def serialize(self, treewalker, encoding=None):

+ self.encoding = encoding

+ in_cdata = False

+ self.errors = []

+ if encoding and self.inject_meta_charset:

+ from ..filters.inject_meta_charset import Filter

+ treewalker = Filter(treewalker, encoding)

+ # WhitespaceFilter should be used before OptionalTagFilter

+ # for maximum efficiently of this latter filter

+ if self.strip_whitespace:

+ from ..filters.whitespace import Filter

+ treewalker = Filter(treewalker)

+ if self.sanitize:

+ from ..filters.sanitizer import Filter

+ treewalker = Filter(treewalker)

+ if self.omit_optional_tags:

+ from ..filters.optionaltags import Filter

+ treewalker = Filter(treewalker)

+ # Alphabetical attributes must be last, as other filters

+ # could add attributes and alter the order

+ if self.alphabetical_attributes:

+ from ..filters.alphabeticalattributes import Filter

+ treewalker = Filter(treewalker)

+ for token in treewalker:

+ type = token["type"]

+ if type == "Doctype":

+ doctype = "<!DOCTYPE %s" % token["name"]

+ if token["publicId"]:

+ doctype += ' PUBLIC "%s"' % token["publicId"]

+ elif token["systemId"]:

+ doctype += " SYSTEM"

+ if token["systemId"]:

+ if token["systemId"].find('"') >= 0:

+ if token["systemId"].find("'") >= 0:

+ self.serializeError(_("System identifer contains both single and double quote characters"))

+ quote_char = "'"

+ else:

+ quote_char = '"'

+ doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

+ doctype += ">"

+ yield self.encodeStrict(doctype)

+ elif type in ("Characters", "SpaceCharacters"):

+ if type == "SpaceCharacters" or in_cdata:

+ if in_cdata and token["data"].find("</") >= 0:

+ self.serializeError(_("Unexpected </ in CDATA"))

+ yield self.encode(token["data"])

+ else:

+ yield self.encode(escape(token["data"]))

+ elif type in ("StartTag", "EmptyTag"):

+ name = token["name"]

+ yield self.encodeStrict("<%s" % name)

+ if name in rcdataElements and not self.escape_rcdata:

+ in_cdata = True

+ elif in_cdata:

+ self.serializeError(_("Unexpected child element of a CDATA element"))

+ for (attr_namespace, attr_name), attr_value in token["data"].items():

+ # TODO: Add namespace support here

+ k = attr_name

+ v = attr_value

+ yield self.encodeStrict(' ')

+ yield self.encodeStrict(k)

+ if not self.minimize_boolean_attributes or \

+ (k not in booleanAttributes.get(name, tuple())

+ and k not in booleanAttributes.get("", tuple())):

+ yield self.encodeStrict("=")

+ if self.quote_attr_values or not v:

+ quote_attr = True

+ else:

+ quote_attr = reduce(lambda x, y: x or (y in v),

+ spaceCharacters + ">\"'=", False)

+ v = v.replace("&", "&")

+ if self.escape_lt_in_attrs:

+ v = v.replace("<", "<")

+ if quote_attr:

+ quote_char = self.quote_char

+ if self.use_best_quote_char:

+ if "'" in v and '"' not in v:

+ quote_char = '"'

+ elif '"' in v and "'" not in v:

+ quote_char = "'"

+ if quote_char == "'":

+ v = v.replace("'", "'")

+ else:

+ v = v.replace('"', """)

+ yield self.encodeStrict(quote_char)

+ yield self.encode(v)

+ yield self.encodeStrict(quote_char)

+ else:

+ yield self.encode(v)

+ if name in voidElements and self.use_trailing_solidus:

+ if self.space_before_trailing_solidus:

+ yield self.encodeStrict(" /")

+ else:

+ yield self.encodeStrict("/")

+ yield self.encode(">")

+ elif type == "EndTag":

+ name = token["name"]

+ if name in rcdataElements:

+ in_cdata = False

+ elif in_cdata:

+ self.serializeError(_("Unexpected child element of a CDATA element"))

+ yield self.encodeStrict("</%s>" % name)

+ elif type == "Comment":

+ data = token["data"]

+ if data.find("--") >= 0:

+ self.serializeError(_("Comment contains --"))

+ yield self.encodeStrict("" % token["data"])

+ elif type == "Entity":

+ name = token["name"]

+ key = name + ";"

+ if key not in entities:

+ self.serializeError(_("Entity %s not recognized" % name))

+ if self.resolve_entities and key not in xmlEntities:

+ data = entities[key]

+ else:

+ data = "&%s;" % name

+ yield self.encodeStrict(data)

+ else:

+ self.serializeError(token["data"])

+ def render(self, treewalker, encoding=None):

+ if encoding:

+ return b"".join(list(self.serialize(treewalker, encoding)))

+ else:

+ return "".join(list(self.serialize(treewalker)))

+ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):

+ # XXX The idea is to make data mandatory.

+ self.errors.append(data)

+ if self.strict:

+ raise SerializeError

+def SerializeError(Exception):

+ """Error in serialized tree"""

+ pass