Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(18)

Unified Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)
Patch Set: nit Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: common/data/text/sanitizehtml/sanitize.go
diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go
new file mode 100644
index 0000000000000000000000000000000000000000..0e03c8c6e751c41f1ce027d2658efb8cd9f78f7f
--- /dev/null
+++ b/common/data/text/sanitizehtml/sanitize.go
@@ -0,0 +1,159 @@
+// Copyright 2017 The LUCI Authors. All rights reserved.
+// Use of this source code is governed under the Apache License, Version 2.0
+// that can be found in the LICENSE file.
+
+// Package sanitizehtml implements a sanitizer of a very limited HTML.
Vadim Sh. 2017/05/02 00:16:24 it would be great to elaborate what exact subset o
nodir 2017/05/04 22:11:00 done, in Sanitize comment
+package sanitizehtml
+
+import (
+ "io"
+ "net/url"
+ "strings"
+ "unicode"
+
+ "golang.org/x/net/html"
+)
+
+// attrValueSanitizer sanitizes an attribute value.
+type attrValueSanitizer func(string) string
+
+func alwaysSafe(s string) string {
+ return s
+}
+
+func sanitizeNum(s string) string {
+ for i, r := range s {
+ if !unicode.IsDigit(r) {
+ // ignore r and the rest.
+ return s[:i]
+ }
+ }
+ return s
+}
+
+func sanitizeURL(s string) string {
+ switch u, err := url.Parse(s); {
+ case err != nil:
+ return "#invalid-url-stripped"
+ case u.Scheme != "http" && u.Scheme != "https":
+ return "#non-http-or-https-url-stripped"
+ case u.Host == "":
+ return "#relative-url-stripped"
+ default:
+ return s
hinoka 2017/05/02 00:36:15 return u.EscapedPath() seems safer. It'll force-es
xtof 2017/05/04 18:26:18 Not-properly escaped query parameters are generall
nodir 2017/05/04 22:11:00 Done.
+ }
+}
+
+// allowedElems defines allowed HTML elements.
+// A key is a lower-case element name and a value defines allowed attributes.
+// The latter is mapping from lower case attribute name to a value sanitizer.
+// If the map is nil, no attributes are allowed.
+var allowedElems = map[string]map[string]attrValueSanitizer{
+ "a": {
Vadim Sh. 2017/05/02 00:16:24 may be a good idea to mandatory add rel="noopener"
nodir 2017/05/04 22:11:00 Done.
+ "alt": alwaysSafe,
+ "href": sanitizeURL,
+ },
+
+ "br": nil,
+ "p": nil,
+
+ "ol": nil,
+ "ul": nil,
+ "li": nil,
+
+ "strong": nil,
+ "em": nil,
+
+ "table": nil,
+ "tr": {
+ "rowspan": sanitizeNum,
+ "colspan": sanitizeNum,
+ },
+ "td": {
+ "rowspan": sanitizeNum,
+ "colspan": sanitizeNum,
+ },
+}
+
+// Sanitize strips all HTML nodes except
+// - paragraphs
+// - strong/emphasized text
+// - valid absolute HTTP(S) links with optional alt.
+// - ordered/unordered lists
+// - tables with optional colspan/rowspan.
+// Anything else is stripped.
+func Sanitize(r io.Reader, w io.Writer) error {
+ root, err := html.Parse(r)
+ if err != nil {
+ return err
+ }
+
+ var writeError error
+ write := func(s string) {
+ if writeError == nil {
+ _, writeError = io.WriteString(w, s)
+ }
+ }
+
+ var visit func(n *html.Node)
+ visitChildren := func(n *html.Node) {
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ visit(c)
+ }
+ }
+ visit = func(n *html.Node) {
+ switch n.Type {
+ case html.TextNode:
+ // print it escaped.
+ write(html.EscapeString(n.Data))
+
+ case html.ElementNode:
+ tag := strings.ToLower(n.Data)
+ if tag == "br" {
+ // br is allowed and it should not be closed
+ write("<br>")
+ return
+ }
+
+ allowedAttrs, allowed := allowedElems[tag]
+ if !allowed {
+ // ignore the tag, but visit children.
+ visitChildren(n)
+ return
+ }
+
+ // this tag is allowed.
+
+ write("<")
+ write(tag)
+ if allowedAttrs == nil {
+ // ignore attributes
+ } else {
+ // write only the allowed attributes.
+ for _, a := range n.Attr {
+ key := strings.ToLower(a.Key)
+ if sanitizer, ok := allowedAttrs[key]; a.Namespace == "" && ok {
hinoka 2017/05/02 00:36:15 Using template/html execute might be better here b
xtof 2017/05/04 18:26:18 I think what's here is fine, and (much) simpler.
nodir 2017/05/04 22:11:00 Acknowledged.
+ write(" ")
+ write(key)
+ write("=\"")
+ write(html.EscapeString(sanitizer(a.Val)))
+ write("\"")
+ }
+ }
+ }
+ write(">")
+
+ visitChildren(n)
+
+ write("</")
+ write(n.Data)
xtof 2017/05/04 18:26:18 write(tag) would be slightly more "obviously safe"
nodir 2017/05/04 22:11:01 I've changed it a bit. I hope it is still OK becau
+ write(">")
+
+ default:
+ // ignore the tag, but visit children.
xtof 2017/05/04 18:26:18 I *think* this will cause the content of <script>
nodir 2017/05/04 22:11:00 indeed, i should not print contents of <script> an
+ visitChildren(n)
+ }
+ }
+ visit(root)
+ return writeError
+}
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | common/data/text/sanitizehtml/sanitize_test.go » ('J')

Powered by Google App Engine
This is Rietveld 408576698