Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(53)

Unified Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)
Patch Set: fix comments Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: common/data/text/sanitizehtml/sanitize.go
diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go
new file mode 100644
index 0000000000000000000000000000000000000000..ce06f04d5d9d107e25e0426e2e60c777c6ab54b8
--- /dev/null
+++ b/common/data/text/sanitizehtml/sanitize.go
@@ -0,0 +1,203 @@
+// Copyright 2017 The LUCI Authors. All rights reserved.
+// Use of this source code is governed under the Apache License, Version 2.0
+// that can be found in the LICENSE file.
+
+// Package sanitizehtml implements a sanitizer for a very limited subset of HTML.
+// See the documentation of Sanitize for details.
+package sanitizehtml
+
+import (
+ "bufio"
+ "io"
+ "net/url"
+ "strings"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
+// attrValueSanitizer sanitizes an attribute value.
+//
+// It receives the raw attribute value and returns the value to print in its
+// place. The result is not expected to be HTML-escaped: printAttrs escapes it
+// before writing.
+type attrValueSanitizer func(string) string
+
+// alwaysSafe is the attrValueSanitizer for attributes whose value needs no
+// sanitization: it returns the value unchanged.
+func alwaysSafe(value string) string {
+	return value
+}
+
+// sanitizeURL returns s re-serialized if it parses as an http or https URL
+// with a non-empty host. Otherwise it returns an innocuous "about:invalid"
+// URL whose fragment names the reason the input was rejected:
+// malformed-url, disallowed-scheme or relative-url.
+func sanitizeURL(s string) string {
+	const sanitizedPrefix = "about:invalid#sanitized&reason="
+
+	parsed, err := url.Parse(s)
+	if err != nil {
+		return sanitizedPrefix + "malformed-url"
+	}
+	if !(parsed.Scheme == "http" || parsed.Scheme == "https") {
+		return sanitizedPrefix + "disallowed-scheme"
+	}
+	if parsed.Host == "" {
+		return sanitizedPrefix + "relative-url"
+	}
+
+	// Return the parser's own serialization rather than the raw input, so
+	// that the output is exactly the URL that passed the checks above.
+	return parsed.String()
+}
+
+// attrMap is an attribute whitelist: it maps an allowed attribute name to the
+// sanitizer applied to that attribute's value. printAttrs looks attributes up
+// by their lowercased name and omits those that have no entry.
+type attrMap map[string]attrValueSanitizer
+
+// Attribute whitelists used by visit for the elements that may keep some of
+// their attributes.
+var (
+ // anchorAttrs is the whitelist for <a>: alt is kept as is, href goes
+ // through sanitizeURL.
+ anchorAttrs = attrMap{
+ "alt": alwaysSafe,
+ "href": sanitizeURL,
+ }
+ // trAttrs is the whitelist for <tr>.
+ trAttrs = attrMap{
+ "rowspan": alwaysSafe,
+ "colspan": alwaysSafe,
+ }
+ // tdAttrs is the whitelist for <td>.
+ tdAttrs = attrMap{
+ "rowspan": alwaysSafe,
+ "colspan": alwaysSafe,
+ }
+)
+
+// stringWriter is the only capability the sanitizer needs from its output.
+// Sanitize uses the destination writer directly when it implements this
+// interface, and wraps it in a *bufio.Writer otherwise.
+type stringWriter interface {
+ WriteString(string) (int, error)
+}
+
+// sanitizer holds the state of a single Sanitize call.
+type sanitizer struct {
+ // sw is the destination of the sanitized markup.
+ sw stringWriter
+ // err is the first error returned by sw.WriteString. Once it is set, p
+ // stops writing.
+ err error
+}
+
+// p writes safeMarkup to the output verbatim, without escaping it.
+//
+// After the first write error p becomes a no-op; the error is kept in s.err.
+func (s *sanitizer) p(safeMarkup string) {
+	if s.err != nil {
+		return
+	}
+	_, s.err = s.sw.WriteString(safeMarkup)
+}
+
+// printAttrs prints the attributes of el that are present in whitelist.
+//
+// Each printed attribute is preceded by a space. Its name is lowercased and
+// its value is passed through the whitelisted sanitizer, HTML-escaped and
+// wrapped in double quotes. Attributes that have a namespace, or that are not
+// in whitelist, are omitted.
+func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) {
+	for _, attr := range el.Attr {
+		if attr.Namespace != "" {
+			continue
+		}
+
+		name := strings.ToLower(attr.Key)
+		sanitize, allowed := whitelist[name]
+		if !allowed {
+			continue
+		}
+
+		s.p(" ")
+		s.p(name)
+		s.p("=\"")
+		s.p(html.EscapeString(sanitize(attr.Val)))
+		s.p("\"")
+	}
+}
+
+// printElem prints safeElement as an opening tag, its sanitized children and
+// a closing tag. Only the attributes listed in allowedAttrs are printed; a
+// nil allowedAttrs omits all attributes.
+//
+// The tag name is printed without any check, so this must only be called for
+// elements that are safe to emit.
+func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {
+	name := safeElement.DataAtom.String()
+
+	// Opening tag.
+	s.p("<")
+	s.p(name)
+	if allowedAttrs != nil {
+		s.printAttrs(safeElement, allowedAttrs)
+	}
+	s.p(">")
+
+	// Content.
+	s.visitChildren(safeElement)
+
+	// Closing tag.
+	s.p("</")
+	s.p(name)
+	s.p(">")
+}
+
+// visit prints the sanitized form of n and of its descendants.
+//
+// Text is printed HTML-escaped. Whitelisted elements are printed with their
+// whitelisted attributes. <script> and <style> are dropped together with
+// their content. Every other node is dropped, but its children are visited.
+func (s *sanitizer) visit(n *html.Node) {
+	if n.Type == html.TextNode {
+		s.p(html.EscapeString(n.Data))
+		return
+	}
+	if n.Type != html.ElementNode {
+		// Not an element and not text: skip the node itself, keep its content.
+		s.visitChildren(n)
+		return
+	}
+
+	// This switch is the whitelist of HTML elements.
+	switch n.DataAtom {
+	case atom.Script, atom.Style:
+		// Dropped entirely: the children are not visited, so the inner text
+		// is not printed either.
+
+	case atom.Br:
+		// br has no content and must not be closed.
+		s.p("<br>")
+
+	case atom.A:
+		// rel and target are always printed; the rest comes from anchorAttrs.
+		s.p(`<a rel="noopener" target="_blank"`)
+		s.printAttrs(n, anchorAttrs)
+		s.p(">")
+		s.visitChildren(n)
+		s.p("</a>")
+
+	case atom.Tr:
+		s.printElem(n, trAttrs)
+
+	case atom.Td:
+		s.printElem(n, tdAttrs)
+
+	case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Table, atom.Strong, atom.Em:
+		// Allowed, but stripped of all attributes.
+		s.printElem(n, nil)
+
+	default:
+		// Element that is not whitelisted: skip the tag, keep its content.
+		s.visitChildren(n)
+	}
+}
+
+// visitChildren calls visit on every child of n, in document order.
+func (s *sanitizer) visitChildren(n *html.Node) {
+	child := n.FirstChild
+	for child != nil {
+		s.visit(child)
+		child = child.NextSibling
+	}
+}
+
+// Sanitize strips all HTML nodes except allowed ones.
+//
+// Unless explicitly specified, attributes are stripped.
+// Allowed elements:
+// - p, br
+// - strong, em
+// - a
+// - if href attribute is not a valid absolute HTTP(s) link, it is replaced
+// with an innocuous one.
+// - alt attribute is allowed
+// - ul, ol, li
+// - table
+// - tr, td. Attributes rowspan/colspan are allowed.
+//
+// Elements <script> and <style> are ignored entirely.
+// For all other HTML nodes, Sanitize ignores the node, but visits its children.
+func Sanitize(w io.Writer, r io.Reader) (err error) {
+ var root *html.Node
+ root, err = html.Parse(r)
+ if err != nil {
+ return err
+ }
+
+ sw, ok := w.(stringWriter)
+ if !ok {
+ bw := bufio.NewWriter(w)
+ defer func() {
+ ferr := bw.Flush()
+ if err == nil {
+ err = ferr
+ }
+ }()
+ sw = bw
+ }
+
+ s := sanitizer{sw: sw}
+ s.visit(root)
+ return s.err
+}
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698