| OLD | NEW |
| 1 // Copyright 2017 The LUCI Authors. All rights reserved. | 1 // Copyright 2017 The LUCI Authors. All rights reserved. |
| 2 // Use of this source code is governed under the Apache License, Version 2.0 | 2 // Use of this source code is governed under the Apache License, Version 2.0 |
| 3 // that can be found in the LICENSE file. | 3 // that can be found in the LICENSE file. |
| 4 | 4 |
| 5 // Package sanitizehtml implements a sanitizer of a very limited HTML. | 5 // Package sanitizehtml implements a sanitizer of a very limited HTML. |
| 6 // See Sanitize comment. | 6 // See Sanitize comment. |
| 7 package sanitizehtml | 7 package sanitizehtml |
| 8 | 8 |
| 9 import ( | 9 import ( |
| 10 "bufio" | 10 "bufio" |
| 11 "io" | 11 "io" |
| 12 "net/url" | 12 "net/url" |
| 13 "strings" | 13 "strings" |
| 14 | 14 |
| 15 "golang.org/x/net/html" | 15 "golang.org/x/net/html" |
| 16 "golang.org/x/net/html/atom" | 16 "golang.org/x/net/html/atom" |
| 17 ) | 17 ) |
| 18 | 18 |
| 19 // attrValueSanitizer sanitizes an attribute value. | |
| 20 type attrValueSanitizer func(string) string | |
| 21 | |
| 22 func alwaysSafe(s string) string { | |
| 23 return s | |
| 24 } | |
| 25 | |
| 26 func sanitizeURL(s string) string { | 19 func sanitizeURL(s string) string { |
| 27 const sanitizedPrefix = "about:invalid#sanitized&reason=" | 20 const sanitizedPrefix = "about:invalid#sanitized&reason=" |
| 28 switch u, err := url.Parse(s); { | 21 switch u, err := url.Parse(s); { |
| 29 case err != nil: | 22 case err != nil: |
| 30 return sanitizedPrefix + "malformed-url" | 23 return sanitizedPrefix + "malformed-url" |
| 31 | 24 |
| 32 case u.Scheme != "http" && u.Scheme != "https": | 25 case u.Scheme != "http" && u.Scheme != "https": |
| 33 return sanitizedPrefix + "disallowed-scheme" | 26 return sanitizedPrefix + "disallowed-scheme" |
| 34 | 27 |
| 35 case u.Host == "": | 28 case u.Host == "": |
| 36 return sanitizedPrefix + "relative-url" | 29 return sanitizedPrefix + "relative-url" |
| 37 | 30 |
| 38 default: | 31 default: |
| 39 // re-serialize the URL to ensure that what we return is what we think | 32 // re-serialize the URL to ensure that what we return is what we think |
| 40 // we parsed. | 33 // we parsed. |
| 41 return u.String() | 34 return u.String() |
| 42 } | 35 } |
| 43 } | 36 } |
| 44 | 37 |
| 45 type attrMap map[string]attrValueSanitizer | |
| 46 | |
| 47 var ( | |
| 48 anchorAttrs = attrMap{ | |
| 49 "alt": alwaysSafe, | |
| 50 "href": sanitizeURL, | |
| 51 } | |
| 52 trAttrs = attrMap{ | |
| 53 "rowspan": alwaysSafe, | |
| 54 "colspan": alwaysSafe, | |
| 55 } | |
| 56 tdAttrs = attrMap{ | |
| 57 "rowspan": alwaysSafe, | |
| 58 "colspan": alwaysSafe, | |
| 59 } | |
| 60 ) | |
| 61 | |
| 62 type stringWriter interface { | 38 type stringWriter interface { |
| 63 WriteString(string) (int, error) | 39 WriteString(string) (int, error) |
| 64 } | 40 } |
| 65 | 41 |
| 66 type sanitizer struct { | 42 type sanitizer struct { |
| 67 sw stringWriter | 43 sw stringWriter |
| 68 err error | 44 err error |
| 69 } | 45 } |
| 70 | 46 |
| 71 // p prints the text, unless there was an error before. | 47 // p prints the text, unless there was an error before. |
| 72 func (s *sanitizer) p(safeMarkup string) { | 48 func (s *sanitizer) p(safeMarkup string) { |
| 73 if s.err == nil { | 49 if s.err == nil { |
| 74 _, s.err = s.sw.WriteString(safeMarkup) | 50 _, s.err = s.sw.WriteString(safeMarkup) |
| 75 } | 51 } |
| 76 } | 52 } |
| 77 | 53 |
| 78 // printAttrs sanitizes and prints a whitelist of attributes in el | 54 // printAttr prints a space and then an HTML attribute node. |
| 79 func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) { | 55 func (s *sanitizer) printAttr(key, value string) { |
| 80 » for _, a := range el.Attr { | 56 » s.p(" ") |
| 81 » » key := strings.ToLower(a.Key) | 57 » s.p(key) |
| 82 » » if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok { | 58 » s.p("=\"") |
| 83 » » » s.p(" ") | 59 » s.p(html.EscapeString(value)) |
| 84 » » » s.p(key) | 60 » s.p("\"") |
| 85 » » » s.p("=\"") | |
| 86 » » » s.p(html.EscapeString(sanitizer(a.Val))) | |
| 87 » » » s.p("\"") | |
| 88 » » } | |
| 89 » } | |
| 90 } | |
| 91 | |
| 92 // printElem prints the safe element with a whitelist of attributes. | |
| 93 // If allowedAttrs is nil, all attributes are omitted. | |
| 94 // | |
| 95 // Do not call for unsafe elements. | |
| 96 func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) { | |
| 97 » tag := safeElement.DataAtom.String() | |
| 98 » s.p("<") | |
| 99 » s.p(tag) | |
| 100 » if allowedAttrs == nil { | |
| 101 » » // ignore attributes | |
| 102 » } else { | |
| 103 » » s.printAttrs(safeElement, allowedAttrs) | |
| 104 » } | |
| 105 » s.p(">") | |
| 106 | |
| 107 » s.visitChildren(safeElement) | |
| 108 | |
| 109 » s.p("</") | |
| 110 » s.p(tag) | |
| 111 » s.p(">") | |
| 112 } | 61 } |
| 113 | 62 |
| 114 func (s *sanitizer) visit(n *html.Node) { | 63 func (s *sanitizer) visit(n *html.Node) { |
| 115 switch n.Type { | 64 switch n.Type { |
| 116 case html.TextNode: | 65 case html.TextNode: |
| 117 // print it escaped. | 66 // print it escaped. |
| 118 s.p(html.EscapeString(n.Data)) | 67 s.p(html.EscapeString(n.Data)) |
| 119 | 68 |
| 120 case html.ElementNode: | 69 case html.ElementNode: |
| 121 // This switch statement defines what HTML elements we allow. | 70 // This switch statement defines what HTML elements we allow. |
| 122 switch n.DataAtom { | 71 switch n.DataAtom { |
| 123 case atom.Br: | 72 case atom.Br: |
| 124 // br is allowed and it should not be closed | 73 // br is allowed and it should not be closed |
| 125 s.p("<br>") | 74 s.p("<br>") |
| 126 | 75 |
| 127 case atom.Script, atom.Style: | 76 case atom.Script, atom.Style: |
| 128 // ignore entirely | 77 // ignore entirely |
| 129 // do not visit children so we don't print inner text | 78 // do not visit children so we don't print inner text |
| 130 | 79 |
| 131 case atom.A: | 80 case atom.A: |
| 132 s.p(`<a rel="noopener" target="_blank"`) | 81 s.p(`<a rel="noopener" target="_blank"`) |
| 133 » » » s.printAttrs(n, anchorAttrs) | 82 |
| 83 » » » for _, a := range n.Attr { |
| 84 » » » » if a.Namespace != "" { |
| 85 » » » » » continue |
| 86 » » » » } |
| 87 » » » » switch strings.ToLower(a.Key) { |
| 88 » » » » case "href": |
| 89 » » » » » s.printAttr("href", sanitizeURL(a.Val)) |
| 90 |
| 91 » » » » case "alt": |
| 92 » » » » » s.printAttr("alt", a.Val) |
| 93 » » » » } |
| 94 » » » } |
| 95 |
| 134 s.p(">") | 96 s.p(">") |
| 135 s.visitChildren(n) | 97 s.visitChildren(n) |
| 136 s.p("</a>") | 98 s.p("</a>") |
| 137 | 99 |
| 138 » » case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Table, atom.Strong, atom.Em: | 100 » » case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Strong, atom.Em: |
| 139 // print without attributes | 101 // print without attributes |
| 140 » » » s.printElem(n, nil) | 102 » » » tag := n.DataAtom.String() |
| 103 » » » s.p("<") |
| 104 » » » s.p(tag) |
| 105 » » » s.p(">") |
| 141 | 106 |
| 142 » » case atom.Tr: | 107 » » » s.visitChildren(n) |
| 143 » » » s.printElem(n, trAttrs) | |
| 144 | 108 |
| 145 » » case atom.Td: | 109 » » » s.p("</") |
| 146 » » » s.printElem(n, tdAttrs) | 110 » » » s.p(tag) |
| 111 » » » s.p(">") |
| 147 | 112 |
| 148 default: | 113 default: |
| 149 // ignore the element, but visit children. | 114 // ignore the element, but visit children. |
| 150 s.visitChildren(n) | 115 s.visitChildren(n) |
| 151 } | 116 } |
| 152 | 117 |
| 153 default: | 118 default: |
| 154 // ignore the node, but visit children. | 119 // ignore the node, but visit children. |
| 155 s.visitChildren(n) | 120 s.visitChildren(n) |
| 156 } | 121 } |
| 157 } | 122 } |
| 158 | 123 |
| 159 func (s *sanitizer) visitChildren(n *html.Node) { | 124 func (s *sanitizer) visitChildren(n *html.Node) { |
| 160 for c := n.FirstChild; c != nil; c = c.NextSibling { | 125 for c := n.FirstChild; c != nil; c = c.NextSibling { |
| 161 s.visit(c) | 126 s.visit(c) |
| 162 } | 127 } |
| 163 } | 128 } |
| 164 | 129 |
| 165 // Sanitize strips all HTML nodes except allowed ones. | 130 // Sanitize strips all HTML nodes except allowed ones. |
| 166 // | 131 // |
| 167 // Unless explicitly specified, attributes are stripped. | 132 // Unless explicitly specified, attributes are stripped. |
| 168 // Allowed elements: | 133 // Allowed elements: |
| 169 // - p, br | 134 // - p, br |
| 170 // - strong, em | 135 // - strong, em |
| 171 // - a | 136 // - a |
| 172 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced | 137 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced |
| 173 // with an innocuous one. | 138 // with an innocuous one. |
| 174 // - alt attribute is allowed | 139 // - alt attribute is allowed |
| 175 // - ul, ol, li | 140 // - ul, ol, li |
| 176 // - table | |
| 177 // - tr, td. Attributes rowspan/colspan are allowed. | |
| 178 // | 141 // |
| 179 // Elements <script> and <style> are ignored entirely. | 142 // Elements <script> and <style> are ignored entirely. |
| 180 // For all other HTML nodes, Sanitize ignores the node, but visits its children. | 143 // For all other HTML nodes, Sanitize ignores the node, but visits its children. |
| 181 func Sanitize(w io.Writer, r io.Reader) (err error) { | 144 func Sanitize(w io.Writer, r io.Reader) (err error) { |
| 182 var root *html.Node | 145 var root *html.Node |
| 183 root, err = html.Parse(r) | 146 root, err = html.Parse(r) |
| 184 if err != nil { | 147 if err != nil { |
| 185 return err | 148 return err |
| 186 } | 149 } |
| 187 | 150 |
| 188 sw, ok := w.(stringWriter) | 151 sw, ok := w.(stringWriter) |
| 189 if !ok { | 152 if !ok { |
| 190 bw := bufio.NewWriter(w) | 153 bw := bufio.NewWriter(w) |
| 191 defer func() { | 154 defer func() { |
| 192 ferr := bw.Flush() | 155 ferr := bw.Flush() |
| 193 if err == nil { | 156 if err == nil { |
| 194 err = ferr | 157 err = ferr |
| 195 } | 158 } |
| 196 }() | 159 }() |
| 197 sw = bw | 160 sw = bw |
| 198 } | 161 } |
| 199 | 162 |
| 200 s := sanitizer{sw: sw} | 163 s := sanitizer{sw: sw} |
| 201 s.visit(root) | 164 s.visit(root) |
| 202 return s.err | 165 return s.err |
| 203 } | 166 } |
| OLD | NEW |