common/data/text/sanitizehtml/sanitize.go - Issue 2849353002: sanitizehtml: add a package to sanitize HTML

Unified Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)

Patch Set: fix comments Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: common/data/text/sanitizehtml/sanitize.go

diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go

new file mode 100644

index 0000000000000000000000000000000000000000..ce06f04d5d9d107e25e0426e2e60c777c6ab54b8

--- /dev/null

+++ b/common/data/text/sanitizehtml/sanitize.go

@@ -0,0 +1,203 @@

+// Use of this source code is governed under the Apache License, Version 2.0

+// that can be found in the LICENSE file.

+// Package sanitizehtml implements a sanitizer of a very limited HTML.

+// See Sanitize comment.

+package sanitizehtml

+import (

+ "bufio"

+ "io"

+ "net/url"

+ "strings"

+ "golang.org/x/net/html"

+ "golang.org/x/net/html/atom"

// attrValueSanitizer sanitizes an attribute value.
//
// It receives the raw attribute value and returns the value that should be
// printed in its place. The result is not expected to be HTML-escaped:
// printAttrs escapes it before writing.
type attrValueSanitizer func(string) string

// alwaysSafe is an attrValueSanitizer that accepts any value and returns it
// unchanged.
func alwaysSafe(s string) string {
	return s
}

// sanitizeURL is an attrValueSanitizer for URL-valued attributes.
//
// If s parses as a URL whose scheme is http or https and whose host is
// non-empty, the re-serialized URL is returned. Otherwise an "about:invalid"
// URL is returned whose fragment names the reason for rejection:
// "malformed-url", "disallowed-scheme" or "relative-url".
func sanitizeURL(s string) string {
	const sanitizedPrefix = "about:invalid#sanitized&reason="

	u, err := url.Parse(s)
	switch {
	case err != nil:
		return sanitizedPrefix + "malformed-url"
	case u.Scheme != "http" && u.Scheme != "https":
		return sanitizedPrefix + "disallowed-scheme"
	case u.Host == "":
		return sanitizedPrefix + "relative-url"
	}

	// Re-serialize the URL to ensure that what we return is what we think
	// we parsed.
	return u.String()
}

+type attrMap map[string]attrValueSanitizer

+var (

+ anchorAttrs = attrMap{

+ "alt": alwaysSafe,

+ "href": sanitizeURL,

+ }

+ trAttrs = attrMap{

+ "rowspan": alwaysSafe,

+ "colspan": alwaysSafe,

+ }

+ tdAttrs = attrMap{

+ "rowspan": alwaysSafe,

+ "colspan": alwaysSafe,

+ }

// stringWriter is the output sink used by sanitizer: anything that can write
// a string and report the number of bytes written and an error.
type stringWriter interface {
	WriteString(string) (int, error)
}

+type sanitizer struct {

+ sw stringWriter

+ err error

+// p prints the text, unless there was an error before.

+func (s *sanitizer) p(safeMarkup string) {

+ if s.err == nil {

+ _, s.err = s.sw.WriteString(safeMarkup)

+ }

+// printAttrs sanitizes and prints a whitelist of attributes in el

+func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) {

+ for _, a := range el.Attr {

+ key := strings.ToLower(a.Key)

+ if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok {

+ s.p(" ")

+ s.p(key)

+ s.p("=\"")

+ s.p(html.EscapeString(sanitizer(a.Val)))

+ s.p("\"")

+ }

+// printElem prints the safe element with a whitelist of attributes.

+// If allowedAttrs is nil, all attributes are omitted.

+//

+// Do not call for unsafe elements.

+func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {

+ tag := safeElement.DataAtom.String()

+ s.p("<")

+ s.p(tag)

+ if allowedAttrs == nil {

+ // ignore attributes

+ } else {

+ s.printAttrs(safeElement, allowedAttrs)

+ }

+ s.p(">")

+ s.visitChildren(safeElement)

+ s.p("</")

+ s.p(tag)

+ s.p(">")

+func (s *sanitizer) visit(n *html.Node) {

+ switch n.Type {

+ case html.TextNode:

+ // print it escaped.

+ s.p(html.EscapeString(n.Data))

+ case html.ElementNode:

+ // This switch statement defines what HTML elements we allow.

+ switch n.DataAtom {

+ case atom.Br:

+ // br is allowed and it should not be closed

+ s.p("<br>")

+ case atom.Script, atom.Style:

+ // ignore entirely

+ // do not visit children so we don't print inner text

+ case atom.A:

+ s.p(`<a rel="noopener" target="_blank"`)

+ s.printAttrs(n, anchorAttrs)

+ s.p(">")

+ s.visitChildren(n)

+ s.p("</a>")

+ case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Table, atom.Strong, atom.Em:

+ // print without attributes

+ s.printElem(n, nil)

+ case atom.Tr:

+ s.printElem(n, trAttrs)

+ case atom.Td:

+ s.printElem(n, tdAttrs)

+ default:

+ // ignore the element, but visit children.

+ s.visitChildren(n)

+ }

+ default:

+ // ignore the node, but visit children.

+ s.visitChildren(n)

+ }

+func (s *sanitizer) visitChildren(n *html.Node) {

+ for c := n.FirstChild; c != nil; c = c.NextSibling {

+ s.visit(c)

+ }

+// Sanitize strips all HTML nodes except allowed ones.

+//

+// Unless explicitly specified, attributes are stripped.

+// Allowed elements:

+// - p, br

+// - strong, em

+// - a

+// - if href attribute is not a valid absolute HTTP(s) link, it is replaced

+// with an innocuous one.

+// - alt attribute is allowed

+// - ul, ol, li

+// - table

+// - tr, td. Attributes rowspan/colspan are allowed.

+//

+// Elements <script> and <style> are ignored entirely.

+// For all other HTML nodes, Sanitize ignores the node, but visits its children.

+func Sanitize(w io.Writer, r io.Reader) (err error) {

+ var root *html.Node

+ root, err = html.Parse(r)

+ if err != nil {

+ return err

+ }

+ sw, ok := w.(stringWriter)

+ if !ok {

+ bw := bufio.NewWriter(w)

+ defer func() {

+ ferr := bw.Flush()

+ if err == nil {

+ err = ferr

+ }

+ }()

+ sw = bw

+ }

+ s := sanitizer{sw: sw}

+ s.visit(root)

+ return s.err

« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »