Chromium Code Reviews| Index: common/data/text/sanitizehtml/sanitize.go |
| diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..59c6cf0a3242afb2399336fa1735b55110cef327 |
| --- /dev/null |
| +++ b/common/data/text/sanitizehtml/sanitize.go |
| @@ -0,0 +1,210 @@ |
| +// Copyright 2017 The LUCI Authors. All rights reserved. |
| +// Use of this source code is governed under the Apache License, Version 2.0 |
| +// that can be found in the LICENSE file. |
| + |
| +// Package sanitizehtml implements a sanitizer of a very limited HTML. |
| +// See Sanitize comment. |
| +package sanitizehtml |
| + |
| +import ( |
| + "io" |
| + "net/url" |
| + "strings" |
| + "unicode" |
| + |
| + "golang.org/x/net/html" |
| +) |
| + |
| +// attrValueSanitizer sanitizes an attribute value. |
| +// If flagged was returned as true, the value was possibly harmful, |
| +// i.e. possibly it is an attack. |
| +type attrValueSanitizer func(value string) (flagged bool, safeValue string) |
| + |
| +func alwaysSafe(s string) (bool, string) { |
| + return false, s |
| +} |
| + |
| +func sanitizeNum(s string) (bool, string) { |
| + for i, r := range s { |
| + if !unicode.IsDigit(r) { |
|
nigeltao1
2017/05/04 23:44:23
I'd just look for ASCII digits. It's not like you'
nodir
2017/05/05 05:23:07
Done.
|
| + // ignore r and the rest. |
| + return false, s[:i] |
| + } |
| + } |
| + return false, s |
| +} |
| + |
| +func sanitizeURL(s string) (bool, string) { |
| + switch u, err := url.Parse(s); { |
| + case err != nil: |
| + return false, "#invalid-url-stripped" |
|
xtof
2017/05/05 15:29:16
It would be preferable to use about:invalid#reason
nodir
2017/05/05 16:10:06
Done. I was doing what Gitiles is doing: https://g
|
| + case strings.EqualFold(u.Scheme, "javascript"): |
| + return true, "#non-http-or-https-url-stripped" |
| + case u.Scheme != "http" && u.Scheme != "https": |
| + return false, "#non-http-or-https-url-stripped" |
| + case u.Host == "": |
| + return false, "#relative-url-stripped" |
| + default: |
| + // re-serialize the URL to ensure that what we return is what we think |
| + // we parsed. |
| + return false, u.String() |
| + } |
| +} |
| + |
| +type attrMap map[string]attrValueSanitizer |
| + |
| +var ( |
| + anchorAttrs = attrMap{ |
| + "alt": alwaysSafe, |
| + "href": sanitizeURL, |
| + } |
| + trAttrs = attrMap{ |
| + "rowspan": sanitizeNum, |
|
xtof
2017/05/05 15:29:16
If you want to remove a little code, feel free to
nodir
2017/05/05 16:10:06
Done. I am always for deleting code.
|
| + "colspan": sanitizeNum, |
| + } |
| + tdAttrs = attrMap{ |
| + "rowspan": sanitizeNum, |
| + "colspan": sanitizeNum, |
| + } |
| +) |
| + |
| +type sanitizer struct { |
| + w io.Writer |
|
nigeltao1
2017/05/04 23:44:23
If you care enough about efficiency (both in terms
nodir
2017/05/05 05:23:07
Done.
|
| + err error |
| + flagged bool |
| +} |
| + |
| +// p prints the text, unless there was an error before. |
| +func (s *sanitizer) p(safeMarkup string) { |
| + if s.err == nil { |
| + _, s.err = io.WriteString(s.w, safeMarkup) |
| + } |
| +} |
| + |
| +// printAttrs sanitizes and prints a whitelist of attributes in el |
| +func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) { |
| + for _, a := range el.Attr { |
| + key := strings.ToLower(a.Key) |
| + if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok { |
| + s.p(" ") |
| + s.p(key) |
| + s.p("=\"") |
| + flagged, safeValue := sanitizer(a.Val) |
| + if flagged { |
| + s.flagged = true |
| + } |
| + s.p(html.EscapeString(safeValue)) |
| + s.p("\"") |
| + } |
| + } |
| +} |
| + |
| +// printElem prints the safe element with a whitelist of attributes. |
| +// If allowedAttrs is nil, all attributes are ommitted. |
|
nigeltao1
2017/05/04 23:44:23
Typo in "omitted".
nodir
2017/05/05 05:23:07
Done.
|
| +// |
| +// Do not call for unsafe elements. |
| +func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) { |
| + tag := strings.ToLower(safeElement.Data) |
| + s.p("<") |
| + s.p(tag) |
| + if allowedAttrs == nil { |
| + // ignore attributes |
| + } else { |
| + s.printAttrs(safeElement, allowedAttrs) |
| + } |
| + s.p(">") |
| + |
| + s.visitChildren(safeElement) |
| + |
| + s.p("</") |
| + s.p(tag) |
| + s.p(">") |
| +} |
| + |
| +func (s *sanitizer) visit(n *html.Node) { |
| + switch n.Type { |
| + case html.TextNode: |
| + // print it escaped. |
| + s.p(html.EscapeString(n.Data)) |
| + |
| + case html.ElementNode: |
| + // This switch statement defines what HTML elements we allow. |
| + switch strings.ToLower(n.Data) { |
|
nigeltao1
2017/05/04 23:44:23
The ToLower'ing is unnecessary if you compare atom
nodir
2017/05/05 05:23:07
nice, thanks, done
|
| + case "br": |
| + // br is allowed and it should not be closed |
| + s.p("<br>") |
| + |
| + case "script": |
| + // ignore entirely |
| + // do not visit children so we don't print inner text |
| + s.flagged = true |
| + |
| + case "style": |
| + // ignore entirely |
| + // do not visit children so we don't print inner text |
| + |
| + case "a": |
| + s.p(`<a rel="noopener" target="_blank"`) |
| + s.printAttrs(n, anchorAttrs) |
| + s.p(">") |
| + s.visitChildren(n) |
| + s.p("</a>") |
| + |
| + case "p", "ol", "ul", "li", "table", "strong", "em": |
| + // print without attributes |
| + s.printElem(n, nil) |
| + |
| + case "tr": |
| + s.printElem(n, trAttrs) |
| + |
| + case "td": |
| + s.printElem(n, tdAttrs) |
| + |
| + default: |
| + // ignore the element, but visit children. |
| + s.visitChildren(n) |
| + } |
| + |
| + default: |
| + // ignore the node, but visit children. |
| + s.visitChildren(n) |
| + } |
| +} |
| + |
| +func (s *sanitizer) visitChildren(n *html.Node) { |
| + for c := n.FirstChild; c != nil; c = c.NextSibling { |
| + s.visit(c) |
| + } |
| +} |
| + |
| +// Sanitize strips all HTML nodes except allowed ones. |
| +// |
| +// Unless explicitly specified, attributes are stripped. |
| +// Allowed elements: |
| +// - p, br |
| +// - strong, em |
| +// - a |
| +// - if href attribute is not a valid absolute HTTP(s) link, it is replaced |
| +// with a innocuous fragment-only link. |
| +// - alt attribute is allowed |
| +// - ul, ol, li |
| +// - table |
| +// - tr, td. Attributes rowspan/colspan are allowed, but if a value contains a |
| +// non-digit character, the character and the rest of the value is stripped. |
| +// |
| +// Elements <script> and <style> are ignored entirely. |
| +// For all other HTML nodes, Sanitize ignores the node, but visits its children. |
| +// |
| +// The returned value flagged, if true, means that the input HTML was possibly |
| +// harmful and advised to be logged. |
|
xtof
2017/05/05 15:29:16
There are many other ways HTML can execute script
nodir
2017/05/05 16:10:06
Removed
|
| +func Sanitize(r io.Reader, w io.Writer) (flagged bool, err error) { |
| + var root *html.Node |
| + root, err = html.Parse(r) |
| + if err != nil { |
| + return |
| + } |
| + |
| + s := sanitizer{w: w} |
| + s.visit(root) |
| + return s.flagged, s.err |
| +} |