Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(17)

Unified Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)
Patch Set: add test Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: common/data/text/sanitizehtml/sanitize.go
diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go
new file mode 100644
index 0000000000000000000000000000000000000000..59c6cf0a3242afb2399336fa1735b55110cef327
--- /dev/null
+++ b/common/data/text/sanitizehtml/sanitize.go
@@ -0,0 +1,210 @@
+// Copyright 2017 The LUCI Authors. All rights reserved.
+// Use of this source code is governed under the Apache License, Version 2.0
+// that can be found in the LICENSE file.
+
+// Package sanitizehtml implements a sanitizer of a very limited HTML.
+// See Sanitize comment.
+package sanitizehtml
+
+import (
+ "io"
+ "net/url"
+ "strings"
+ "unicode"
+
+ "golang.org/x/net/html"
+)
+
+// attrValueSanitizer sanitizes an attribute value.
+// If flagged was returned as true, the value was possibly harmful,
+// i.e. possibly it is an attack.
+type attrValueSanitizer func(value string) (flagged bool, safeValue string)
+
+func alwaysSafe(s string) (bool, string) {
+ return false, s
+}
+
+func sanitizeNum(s string) (bool, string) {
+ for i, r := range s {
+ if !unicode.IsDigit(r) {
nigeltao1 2017/05/04 23:44:23 I'd just look for ASCII digits. It's not like you'
nodir 2017/05/05 05:23:07 Done.
+ // ignore r and the rest.
+ return false, s[:i]
+ }
+ }
+ return false, s
+}
+
+func sanitizeURL(s string) (bool, string) {
+ switch u, err := url.Parse(s); {
+ case err != nil:
+ return false, "#invalid-url-stripped"
xtof 2017/05/05 15:29:16 It would be preferable to use about:invalid#reason
nodir 2017/05/05 16:10:06 Done. I was doing what Gitiles is doing: https://g
+ case strings.EqualFold(u.Scheme, "javascript"):
+ return true, "#non-http-or-https-url-stripped"
+ case u.Scheme != "http" && u.Scheme != "https":
+ return false, "#non-http-or-https-url-stripped"
+ case u.Host == "":
+ return false, "#relative-url-stripped"
+ default:
+ // re-serialize the URL to ensure that what we return is what we think
+ // we parsed.
+ return false, u.String()
+ }
+}
+
+type attrMap map[string]attrValueSanitizer
+
+var (
+ anchorAttrs = attrMap{
+ "alt": alwaysSafe,
+ "href": sanitizeURL,
+ }
+ trAttrs = attrMap{
+ "rowspan": sanitizeNum,
xtof 2017/05/05 15:29:16 If you want to remove a little code, feel free to
nodir 2017/05/05 16:10:06 Done. I am always for deleting code.
+ "colspan": sanitizeNum,
+ }
+ tdAttrs = attrMap{
+ "rowspan": sanitizeNum,
+ "colspan": sanitizeNum,
+ }
+)
+
+type sanitizer struct {
+ w io.Writer
nigeltao1 2017/05/04 23:44:23 If you care enough about efficiency (both in terms
nodir 2017/05/05 05:23:07 Done.
+ err error
+ flagged bool
+}
+
+// p prints the text, unless there was an error before.
+func (s *sanitizer) p(safeMarkup string) {
+ if s.err == nil {
+ _, s.err = io.WriteString(s.w, safeMarkup)
+ }
+}
+
+// printAttrs sanitizes and prints a whitelist of attributes in el
+func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) {
+ for _, a := range el.Attr {
+ key := strings.ToLower(a.Key)
+ if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok {
+ s.p(" ")
+ s.p(key)
+ s.p("=\"")
+ flagged, safeValue := sanitizer(a.Val)
+ if flagged {
+ s.flagged = true
+ }
+ s.p(html.EscapeString(safeValue))
+ s.p("\"")
+ }
+ }
+}
+
+// printElem prints the safe element with a whitelist of attributes.
+// If allowedAttrs is nil, all attributes are ommitted.
nigeltao1 2017/05/04 23:44:23 Typo in "omitted".
nodir 2017/05/05 05:23:07 Done.
+//
+// Do not call for unsafe elements.
+func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {
+ tag := strings.ToLower(safeElement.Data)
+ s.p("<")
+ s.p(tag)
+ if allowedAttrs == nil {
+ // ignore attributes
+ } else {
+ s.printAttrs(safeElement, allowedAttrs)
+ }
+ s.p(">")
+
+ s.visitChildren(safeElement)
+
+ s.p("</")
+ s.p(tag)
+ s.p(">")
+}
+
+func (s *sanitizer) visit(n *html.Node) {
+ switch n.Type {
+ case html.TextNode:
+ // print it escaped.
+ s.p(html.EscapeString(n.Data))
+
+ case html.ElementNode:
+ // This switch statement defines what HTML elements we allow.
+ switch strings.ToLower(n.Data) {
nigeltao1 2017/05/04 23:44:23 The ToLower'ing is unnecessary if you compare atom
nodir 2017/05/05 05:23:07 nice, thanks, done
+ case "br":
+ // br is allowed and it should not be closed
+ s.p("<br>")
+
+ case "script":
+ // ignore entirely
+ // do not visit children so we don't print inner text
+ s.flagged = true
+
+ case "style":
+ // ignore entirely
+ // do not visit children so we don't print inner text
+
+ case "a":
+ s.p(`<a rel="noopener" target="_blank"`)
+ s.printAttrs(n, anchorAttrs)
+ s.p(">")
+ s.visitChildren(n)
+ s.p("</a>")
+
+ case "p", "ol", "ul", "li", "table", "strong", "em":
+ // print without attributes
+ s.printElem(n, nil)
+
+ case "tr":
+ s.printElem(n, trAttrs)
+
+ case "td":
+ s.printElem(n, tdAttrs)
+
+ default:
+ // ignore the element, but visit children.
+ s.visitChildren(n)
+ }
+
+ default:
+ // ignore the node, but visit children.
+ s.visitChildren(n)
+ }
+}
+
+func (s *sanitizer) visitChildren(n *html.Node) {
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ s.visit(c)
+ }
+}
+
+// Sanitize strips all HTML nodes except allowed ones.
+//
+// Unless explicitly specified, attributes are stripped.
+// Allowed elements:
+// - p, br
+// - strong, em
+// - a
+// - if href attribute is not a valid absolute HTTP(s) link, it is replaced
+// with a innocuous fragment-only link.
+// - alt attribute is allowed
+// - ul, ol, li
+// - table
+// - tr, td. Attributes rowspan/colspan are allowed, but if a value contains a
+// non-digit character, the character and the rest of the value is stripped.
+//
+// Elements <script> and <style> are ignored entirely.
+// For all other HTML nodes, Sanitize ignores the node, but visits its children.
+//
+// The returned value flagged, if true, means that the input HTML was possibly
+// harmful and advised to be logged.
xtof 2017/05/05 15:29:16 There are many other ways HTML can execute script
nodir 2017/05/05 16:10:06 Removed
+func Sanitize(r io.Reader, w io.Writer) (flagged bool, err error) {
+ var root *html.Node
+ root, err = html.Parse(r)
+ if err != nil {
+ return
+ }
+
+ s := sanitizer{w: w}
+ s.visit(root)
+ return s.flagged, s.err
+}
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698