Chromium Code Reviews

| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2017 The LUCI Authors. All rights reserved. | |
| 2 // Use of this source code is governed under the Apache License, Version 2.0 | |
| 3 // that can be found in the LICENSE file. | |
| 4 | |
| 5 // Package sanitizehtml implements a sanitizer of a very limited HTML. | |
| 6 // See Sanitize comment. | |
| 7 package sanitizehtml | |
| 8 | |
| 9 import ( | |
| 10 "io" | |
| 11 "net/url" | |
| 12 "strings" | |
| 13 "unicode" | |
| 14 | |
| 15 "golang.org/x/net/html" | |
| 16 ) | |
| 17 | |
| 18 // attrValueSanitizer sanitizes an attribute value. | |
| 19 // If flagged is returned as true, the value was possibly harmful, | |
| 20 // i.e. it is possibly an attack. | |
| 21 type attrValueSanitizer func(value string) (flagged bool, safeValue string) | |
| 22 | |
| 23 func alwaysSafe(s string) (bool, string) { | |
| 24 return false, s | |
| 25 } | |
| 26 | |
| 27 func sanitizeNum(s string) (bool, string) { | |
| 28 for i, r := range s { | |
| 29 if !unicode.IsDigit(r) { | |
|
nigeltao1
2017/05/04 23:44:23
I'd just look for ASCII digits. It's not like you'
nodir
2017/05/05 05:23:07
Done.
| |
| 30 // ignore r and the rest. | |
| 31 return false, s[:i] | |
| 32 } | |
| 33 } | |
| 34 return false, s | |
| 35 } | |
| 36 | |
| 37 func sanitizeURL(s string) (bool, string) { | |
| 38 switch u, err := url.Parse(s); { | |
| 39 case err != nil: | |
| 40 return false, "#invalid-url-stripped" | |
|
xtof
2017/05/05 15:29:16
It would be preferable to use about:invalid#reason
nodir
2017/05/05 16:10:06
Done. I was doing what Gitiles is doing: https://g
| |
| 41 case strings.EqualFold(u.Scheme, "javascript"): | |
| 42 return true, "#non-http-or-https-url-stripped" | |
| 43 case u.Scheme != "http" && u.Scheme != "https": | |
| 44 return false, "#non-http-or-https-url-stripped" | |
| 45 case u.Host == "": | |
| 46 return false, "#relative-url-stripped" | |
| 47 default: | |
| 48 // re-serialize the URL to ensure that what we return is what we think | |
| 49 // we parsed. | |
| 50 return false, u.String() | |
| 51 } | |
| 52 } | |
| 53 | |
| 54 type attrMap map[string]attrValueSanitizer | |
| 55 | |
| 56 var ( | |
| 57 anchorAttrs = attrMap{ | |
| 58 "alt": alwaysSafe, | |
| 59 "href": sanitizeURL, | |
| 60 } | |
| 61 trAttrs = attrMap{ | |
| 62 "rowspan": sanitizeNum, | |
|
xtof
2017/05/05 15:29:16
If you want to remove a little code, feel free to
nodir
2017/05/05 16:10:06
Done. I am always for deleting code.
| |
| 63 "colspan": sanitizeNum, | |
| 64 } | |
| 65 tdAttrs = attrMap{ | |
| 66 "rowspan": sanitizeNum, | |
| 67 "colspan": sanitizeNum, | |
| 68 } | |
| 69 ) | |
| 70 | |
| 71 type sanitizer struct { | |
| 72 w io.Writer | |
|
nigeltao1
2017/05/04 23:44:23
If you care enough about efficiency (both in terms
nodir
2017/05/05 05:23:07
Done.
| |
| 73 err error | |
| 74 flagged bool | |
| 75 } | |
| 76 | |
| 77 // p prints the text, unless there was an error before. | |
| 78 func (s *sanitizer) p(safeMarkup string) { | |
| 79 if s.err == nil { | |
| 80 _, s.err = io.WriteString(s.w, safeMarkup) | |
| 81 } | |
| 82 } | |
| 83 | |
| 84 // printAttrs sanitizes and prints a whitelist of attributes in el | |
| 85 func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) { | |
| 86 for _, a := range el.Attr { | |
| 87 key := strings.ToLower(a.Key) | |
| 88 if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok { | |
| 89 s.p(" ") | |
| 90 s.p(key) | |
| 91 s.p("=\"") | |
| 92 flagged, safeValue := sanitizer(a.Val) | |
| 93 if flagged { | |
| 94 s.flagged = true | |
| 95 } | |
| 96 s.p(html.EscapeString(safeValue)) | |
| 97 s.p("\"") | |
| 98 } | |
| 99 } | |
| 100 } | |
| 101 | |
| 102 // printElem prints the safe element with a whitelist of attributes. | |
| 103 // If allowedAttrs is nil, all attributes are omitted. | |
|
nigeltao1
2017/05/04 23:44:23
Typo in "omitted".
nodir
2017/05/05 05:23:07
Done.
| |
| 104 // | |
| 105 // Do not call for unsafe elements. | |
| 106 func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) { | |
| 107 tag := strings.ToLower(safeElement.Data) | |
| 108 s.p("<") | |
| 109 s.p(tag) | |
| 110 if allowedAttrs == nil { | |
| 111 // ignore attributes | |
| 112 } else { | |
| 113 s.printAttrs(safeElement, allowedAttrs) | |
| 114 } | |
| 115 s.p(">") | |
| 116 | |
| 117 s.visitChildren(safeElement) | |
| 118 | |
| 119 s.p("</") | |
| 120 s.p(tag) | |
| 121 s.p(">") | |
| 122 } | |
| 123 | |
| 124 func (s *sanitizer) visit(n *html.Node) { | |
| 125 switch n.Type { | |
| 126 case html.TextNode: | |
| 127 // print it escaped. | |
| 128 s.p(html.EscapeString(n.Data)) | |
| 129 | |
| 130 case html.ElementNode: | |
| 131 // This switch statement defines what HTML elements we allow. | |
| 132 switch strings.ToLower(n.Data) { | |
|
nigeltao1
2017/05/04 23:44:23
The ToLower'ing is unnecessary if you compare atom
nodir
2017/05/05 05:23:07
nice, thanks, done
| |
| 133 case "br": | |
| 134 // br is allowed and it should not be closed | |
| 135 s.p("<br>") | |
| 136 | |
| 137 case "script": | |
| 138 // ignore entirely | |
| 139 // do not visit children so we don't print inner text | |
| 140 s.flagged = true | |
| 141 | |
| 142 case "style": | |
| 143 // ignore entirely | |
| 144 // do not visit children so we don't print inner text | |
| 145 | |
| 146 case "a": | |
| 147 s.p(`<a rel="noopener" target="_blank"`) | |
| 148 s.printAttrs(n, anchorAttrs) | |
| 149 s.p(">") | |
| 150 s.visitChildren(n) | |
| 151 s.p("</a>") | |
| 152 | |
| 153 case "p", "ol", "ul", "li", "table", "strong", "em": | |
| 154 // print without attributes | |
| 155 s.printElem(n, nil) | |
| 156 | |
| 157 case "tr": | |
| 158 s.printElem(n, trAttrs) | |
| 159 | |
| 160 case "td": | |
| 161 s.printElem(n, tdAttrs) | |
| 162 | |
| 163 default: | |
| 164 // ignore the element, but visit children. | |
| 165 s.visitChildren(n) | |
| 166 } | |
| 167 | |
| 168 default: | |
| 169 // ignore the node, but visit children. | |
| 170 s.visitChildren(n) | |
| 171 } | |
| 172 } | |
| 173 | |
| 174 func (s *sanitizer) visitChildren(n *html.Node) { | |
| 175 for c := n.FirstChild; c != nil; c = c.NextSibling { | |
| 176 s.visit(c) | |
| 177 } | |
| 178 } | |
| 179 | |
| 180 // Sanitize strips all HTML nodes except allowed ones. | |
| 181 // | |
| 182 // Unless explicitly specified, attributes are stripped. | |
| 183 // Allowed elements: | |
| 184 // - p, br | |
| 185 // - strong, em | |
| 186 // - a | |
| 187 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced | |
| 188 // with an innocuous fragment-only link. | |
| 189 // - alt attribute is allowed | |
| 190 // - ul, ol, li | |
| 191 // - table | |
| 192 // - tr, td. Attributes rowspan/colspan are allowed, but if a value contains a | |
| 193 // non-digit character, the character and the rest of the value are stripped. | |
| 194 // | |
| 195 // Elements <script> and <style> are ignored entirely. | |
| 196 // For all other HTML nodes, Sanitize ignores the node, but visits its children. | |
| 197 // | |
| 198 // The returned value flagged, if true, means that the input HTML was possibly | |
| 199 // harmful and is advised to be logged. | |
|
xtof
2017/05/05 15:29:16
There are many other ways HTML can execute script
nodir
2017/05/05 16:10:06
Removed
| |
| 200 func Sanitize(r io.Reader, w io.Writer) (flagged bool, err error) { | |
| 201 var root *html.Node | |
| 202 root, err = html.Parse(r) | |
| 203 if err != nil { | |
| 204 return | |
| 205 } | |
| 206 | |
| 207 s := sanitizer{w: w} | |
| 208 s.visit(root) | |
| 209 return s.flagged, s.err | |
| 210 } | |
| OLD | NEW |