common/data/text/sanitizehtml/sanitize.go - Issue 2849353002: sanitizehtml: add a package to sanitize HTML

Unified Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)

Patch Set: add test Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: common/data/text/sanitizehtml/sanitize.go

diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go

new file mode 100644

index 0000000000000000000000000000000000000000..59c6cf0a3242afb2399336fa1735b55110cef327

--- /dev/null

+++ b/common/data/text/sanitizehtml/sanitize.go

@@ -0,0 +1,210 @@

+// Use of this source code is governed under the Apache License, Version 2.0

+// that can be found in the LICENSE file.

+// Package sanitizehtml implements a sanitizer of a very limited HTML.

+// See Sanitize comment.

+package sanitizehtml

+import (

+ "io"

+ "net/url"

+ "strings"

+ "unicode"

+ "golang.org/x/net/html"

+// attrValueSanitizer sanitizes an attribute value.

+// If flagged was returned as true, the value was possibly harmful,

+// i.e. possibly it is an attack.

+type attrValueSanitizer func(value string) (flagged bool, safeValue string)

+func alwaysSafe(s string) (bool, string) {

+ return false, s

+func sanitizeNum(s string) (bool, string) {

+ for i, r := range s {

+ if !unicode.IsDigit(r) {

nigeltao1 2017/05/04 23:44:23 I'd just look for ASCII digits. It's not like you'

nodir 2017/05/05 05:23:07 Done.

+ // ignore r and the rest.

+ return false, s[:i]

+ }

+ return false, s

+func sanitizeURL(s string) (bool, string) {

+ switch u, err := url.Parse(s); {

+ case err != nil:

+ return false, "#invalid-url-stripped"

xtof 2017/05/05 15:29:16 It would be preferable to use about:invalid#reason

nodir 2017/05/05 16:10:06 Done. I was doing what Gitiles is doing: https://g

+ case strings.EqualFold(u.Scheme, "javascript"):

+ return true, "#non-http-or-https-url-stripped"

+ case u.Scheme != "http" && u.Scheme != "https":

+ return false, "#non-http-or-https-url-stripped"

+ case u.Host == "":

+ return false, "#relative-url-stripped"

+ default:

+ // re-serialize the URL to ensure that what we return is what we think

+ // we parsed.

+ return false, u.String()

+ }

+type attrMap map[string]attrValueSanitizer

+var (

+ anchorAttrs = attrMap{

+ "alt": alwaysSafe,

+ "href": sanitizeURL,

+ }

+ trAttrs = attrMap{

+ "rowspan": sanitizeNum,

xtof 2017/05/05 15:29:16 If you want to remove a little code, feel free to

nodir 2017/05/05 16:10:06 Done. I am always for deleting code.

+ "colspan": sanitizeNum,

+ }

+ tdAttrs = attrMap{

+ "rowspan": sanitizeNum,

+ "colspan": sanitizeNum,

+ }

+type sanitizer struct {

+ w io.Writer

nigeltao1 2017/05/04 23:44:23 If you care enough about efficiency (both in terms

nodir 2017/05/05 05:23:07 Done.

+ err error

+ flagged bool

+// p prints the text, unless there was an error before.

+func (s *sanitizer) p(safeMarkup string) {

+ if s.err == nil {

+ _, s.err = io.WriteString(s.w, safeMarkup)

+ }

+// printAttrs sanitizes and prints a whitelist of attributes in el

+func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) {

+ for _, a := range el.Attr {

+ key := strings.ToLower(a.Key)

+ if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok {

+ s.p(" ")

+ s.p(key)

+ s.p("=\"")

+ flagged, safeValue := sanitizer(a.Val)

+ if flagged {

+ s.flagged = true

+ }

+ s.p(html.EscapeString(safeValue))

+ s.p("\"")

+ }

+// printElem prints the safe element with a whitelist of attributes.

+// If allowedAttrs is nil, all attributes are ommitted.

nigeltao1 2017/05/04 23:44:23 Typo in "omitted".

nodir 2017/05/05 05:23:07 Done.

+//

+// Do not call for unsafe elements.

+func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {

+ tag := strings.ToLower(safeElement.Data)

+ s.p("<")

+ s.p(tag)

+ if allowedAttrs == nil {

+ // ignore attributes

+ } else {

+ s.printAttrs(safeElement, allowedAttrs)

+ }

+ s.p(">")

+ s.visitChildren(safeElement)

+ s.p("</")

+ s.p(tag)

+ s.p(">")

+func (s *sanitizer) visit(n *html.Node) {

+ switch n.Type {

+ case html.TextNode:

+ // print it escaped.

+ s.p(html.EscapeString(n.Data))

+ case html.ElementNode:

+ // This switch statement defines what HTML elements we allow.

+ switch strings.ToLower(n.Data) {

nigeltao1 2017/05/04 23:44:23 The ToLower'ing is unnecessary if you compare atom

nodir 2017/05/05 05:23:07 nice, thanks, done

+ case "br":

+ // br is allowed and it should not be closed

+ s.p("<br>")

+ case "script":

+ // ignore entirely

+ // do not visit children so we don't print inner text

+ s.flagged = true

+ case "style":

+ // ignore entirely

+ // do not visit children so we don't print inner text

+ case "a":

+ s.p(`<a rel="noopener" target="_blank"`)

+ s.printAttrs(n, anchorAttrs)

+ s.p(">")

+ s.visitChildren(n)

+ s.p("</a>")

+ case "p", "ol", "ul", "li", "table", "strong", "em":

+ // print without attributes

+ s.printElem(n, nil)

+ case "tr":

+ s.printElem(n, trAttrs)

+ case "td":

+ s.printElem(n, tdAttrs)

+ default:

+ // ignore the element, but visit children.

+ s.visitChildren(n)

+ }

+ default:

+ // ignore the node, but visit children.

+ s.visitChildren(n)

+ }

+func (s *sanitizer) visitChildren(n *html.Node) {

+ for c := n.FirstChild; c != nil; c = c.NextSibling {

+ s.visit(c)

+ }

+// Sanitize strips all HTML nodes except allowed ones.

+//

+// Unless explicitly specified, attributes are stripped.

+// Allowed elements:

+// - p, br

+// - strong, em

+// - a

+// - if href attribute is not a valid absolute HTTP(s) link, it is replaced

+// with a innocuous fragment-only link.

+// - alt attribute is allowed

+// - ul, ol, li

+// - table

+// - tr, td. Attributes rowspan/colspan are allowed, but if a value contains a

+// non-digit character, the character and the rest of the value is stripped.

+//

+// Elements <script> and <style> are ignored entirely.

+// For all other HTML nodes, Sanitize ignores the node, but visits its children.

+//

+// The returned value flagged, if true, means that the input HTML was possibly

+// harmful and advised to be logged.

xtof 2017/05/05 15:29:16 There are many other ways HTML can execute script

nodir 2017/05/05 16:10:06 Removed

+func Sanitize(r io.Reader, w io.Writer) (flagged bool, err error) {

+ var root *html.Node

+ root, err = html.Parse(r)

+ if err != nil {

+ return

+ }

+ s := sanitizer{w: w}

+ s.visit(root)

+ return s.flagged, s.err

« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »