Chromium Code Reviews| Index: common/data/text/sanitizehtml/sanitize.go |
| diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..0e03c8c6e751c41f1ce027d2658efb8cd9f78f7f |
| --- /dev/null |
| +++ b/common/data/text/sanitizehtml/sanitize.go |
| @@ -0,0 +1,159 @@ |
| +// Copyright 2017 The LUCI Authors. All rights reserved. |
| +// Use of this source code is governed under the Apache License, Version 2.0 |
| +// that can be found in the LICENSE file. |
| + |
| +// Package sanitizehtml implements a sanitizer of a very limited HTML. |
|
Vadim Sh.
2017/05/02 00:16:24
it would be great to elaborate what exact subset o
nodir
2017/05/04 22:11:00
done, in Sanitize comment
|
| +package sanitizehtml |
| + |
| +import ( |
| + "io" |
| + "net/url" |
| + "strings" |
| + "unicode" |
| + |
| + "golang.org/x/net/html" |
| +) |
| + |
| +// attrValueSanitizer sanitizes an attribute value. |
| +type attrValueSanitizer func(string) string |
| + |
| +func alwaysSafe(s string) string { |
| + return s |
| +} |
| + |
| +func sanitizeNum(s string) string { |
| + for i, r := range s { |
| + if !unicode.IsDigit(r) { |
| + // ignore r and the rest. |
| + return s[:i] |
| + } |
| + } |
| + return s |
| +} |
| + |
| +func sanitizeURL(s string) string { |
| + switch u, err := url.Parse(s); { |
| + case err != nil: |
| + return "#invalid-url-stripped" |
| + case u.Scheme != "http" && u.Scheme != "https": |
| + return "#non-http-or-https-url-stripped" |
| + case u.Host == "": |
| + return "#relative-url-stripped" |
| + default: |
| + return s |
|
hinoka
2017/05/02 00:36:15
return u.EscapedPath() seems safer. It'll force-es
xtof
2017/05/04 18:26:18
Not-properly escaped query parameters are generall
nodir
2017/05/04 22:11:00
Done.
|
| + } |
| +} |
| + |
| +// allowedElems defines allowed HTML elements. |
| +// A key is a lower-case element name and a value defines allowed attributes. |
| +// The latter is mapping from lower case attribute name to a value sanitizer. |
| +// If the map is nil, no attributes are allowed. |
| +var allowedElems = map[string]map[string]attrValueSanitizer{ |
| + "a": { |
|
Vadim Sh.
2017/05/02 00:16:24
may be a good idea to mandatory add rel="noopener"
nodir
2017/05/04 22:11:00
Done.
|
| + "alt": alwaysSafe, |
| + "href": sanitizeURL, |
| + }, |
| + |
| + "br": nil, |
| + "p": nil, |
| + |
| + "ol": nil, |
| + "ul": nil, |
| + "li": nil, |
| + |
| + "strong": nil, |
| + "em": nil, |
| + |
| + "table": nil, |
| + "tr": { |
| + "rowspan": sanitizeNum, |
| + "colspan": sanitizeNum, |
| + }, |
| + "td": { |
| + "rowspan": sanitizeNum, |
| + "colspan": sanitizeNum, |
| + }, |
| +} |
| + |
| +// Sanitize strips all HTML nodes except |
| +// - paragraphs |
| +// - strong/emphasized text |
| +// - valid absolute HTTP(S) links with optional alt. |
| +// - ordered/unordered lists |
| +// - tables with optional colspan/rowspan. |
| +// Anything else is stripped. |
| +func Sanitize(r io.Reader, w io.Writer) error { |
| + root, err := html.Parse(r) |
| + if err != nil { |
| + return err |
| + } |
| + |
| + var writeError error |
| + write := func(s string) { |
| + if writeError == nil { |
| + _, writeError = io.WriteString(w, s) |
| + } |
| + } |
| + |
| + var visit func(n *html.Node) |
| + visitChildren := func(n *html.Node) { |
| + for c := n.FirstChild; c != nil; c = c.NextSibling { |
| + visit(c) |
| + } |
| + } |
| + visit = func(n *html.Node) { |
| + switch n.Type { |
| + case html.TextNode: |
| + // print it escaped. |
| + write(html.EscapeString(n.Data)) |
| + |
| + case html.ElementNode: |
| + tag := strings.ToLower(n.Data) |
| + if tag == "br" { |
| + // br is allowed and it should not be closed |
| + write("<br>") |
| + return |
| + } |
| + |
| + allowedAttrs, allowed := allowedElems[tag] |
| + if !allowed { |
| + // ignore the tag, but visit children. |
| + visitChildren(n) |
| + return |
| + } |
| + |
| + // this tag is allowed. |
| + |
| + write("<") |
| + write(tag) |
| + if allowedAttrs == nil { |
| + // ignore attributes |
| + } else { |
| + // write only the allowed attributes. |
| + for _, a := range n.Attr { |
| + key := strings.ToLower(a.Key) |
| + if sanitizer, ok := allowedAttrs[key]; a.Namespace == "" && ok { |
|
hinoka
2017/05/02 00:36:15
Using template/html execute might be better here b
xtof
2017/05/04 18:26:18
I think what's here is fine, and (much) simpler.
nodir
2017/05/04 22:11:00
Acknowledged.
|
| + write(" ") |
| + write(key) |
| + write("=\"") |
| + write(html.EscapeString(sanitizer(a.Val))) |
| + write("\"") |
| + } |
| + } |
| + } |
| + write(">") |
| + |
| + visitChildren(n) |
| + |
| + write("</") |
| + write(n.Data) |
|
xtof
2017/05/04 18:26:18
write(tag) would be slightly more "obviously safe"
nodir
2017/05/04 22:11:01
I've changed it a bit. I hope it is still OK becau
|
| + write(">") |
| + |
| + default: |
| + // ignore the tag, but visit children. |
|
xtof
2017/05/04 18:26:18
I *think* this will cause the content of <script>
nodir
2017/05/04 22:11:00
indeed, i should not print contents of <script> an
|
| + visitChildren(n) |
| + } |
| + } |
| + visit(root) |
| + return writeError |
| +} |