Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(427)

Side by Side Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2873983003: sanitizehtml: disallow tables (Closed)
Patch Set: "simplify" (created 3 years, 7 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2017 The LUCI Authors. All rights reserved. 1 // Copyright 2017 The LUCI Authors. All rights reserved.
2 // Use of this source code is governed under the Apache License, Version 2.0 2 // Use of this source code is governed under the Apache License, Version 2.0
3 // that can be found in the LICENSE file. 3 // that can be found in the LICENSE file.
4 4
5 // Package sanitizehtml implements a sanitizer of a very limited HTML. 5 // Package sanitizehtml implements a sanitizer of a very limited HTML.
6 // See Sanitize comment. 6 // See Sanitize comment.
7 package sanitizehtml 7 package sanitizehtml
8 8
9 import ( 9 import (
10 "bufio" 10 "bufio"
11 "io" 11 "io"
12 "net/url" 12 "net/url"
13 "strings" 13 "strings"
14 14
15 "golang.org/x/net/html" 15 "golang.org/x/net/html"
16 "golang.org/x/net/html/atom" 16 "golang.org/x/net/html/atom"
17 ) 17 )
18 18
19 // attrValueSanitizer sanitizes an attribute value.
20 type attrValueSanitizer func(string) string
21
22 func alwaysSafe(s string) string {
23 return s
24 }
25
26 func sanitizeURL(s string) string { 19 func sanitizeURL(s string) string {
27 const sanitizedPrefix = "about:invalid#sanitized&reason=" 20 const sanitizedPrefix = "about:invalid#sanitized&reason="
28 switch u, err := url.Parse(s); { 21 switch u, err := url.Parse(s); {
29 case err != nil: 22 case err != nil:
30 return sanitizedPrefix + "malformed-url" 23 return sanitizedPrefix + "malformed-url"
31 24
32 case u.Scheme != "http" && u.Scheme != "https": 25 case u.Scheme != "http" && u.Scheme != "https":
33 return sanitizedPrefix + "disallowed-scheme" 26 return sanitizedPrefix + "disallowed-scheme"
34 27
35 case u.Host == "": 28 case u.Host == "":
36 return sanitizedPrefix + "relative-url" 29 return sanitizedPrefix + "relative-url"
37 30
38 default: 31 default:
39 // re-serialize the URL to ensure that what we return is what we think 32 // re-serialize the URL to ensure that what we return is what we think
40 // we parsed. 33 // we parsed.
41 return u.String() 34 return u.String()
42 } 35 }
43 } 36 }
44 37
45 type attrMap map[string]attrValueSanitizer
46
47 var (
48 anchorAttrs = attrMap{
49 "alt": alwaysSafe,
50 "href": sanitizeURL,
51 }
52 trAttrs = attrMap{
53 "rowspan": alwaysSafe,
54 "colspan": alwaysSafe,
55 }
56 tdAttrs = attrMap{
57 "rowspan": alwaysSafe,
58 "colspan": alwaysSafe,
59 }
60 )
61
62 type stringWriter interface { 38 type stringWriter interface {
63 WriteString(string) (int, error) 39 WriteString(string) (int, error)
64 } 40 }
65 41
66 type sanitizer struct { 42 type sanitizer struct {
67 sw stringWriter 43 sw stringWriter
68 err error 44 err error
69 } 45 }
70 46
71 // p prints the text, unless there was an error before. 47 // p prints the text, unless there was an error before.
72 func (s *sanitizer) p(safeMarkup string) { 48 func (s *sanitizer) p(safeMarkup string) {
73 if s.err == nil { 49 if s.err == nil {
74 _, s.err = s.sw.WriteString(safeMarkup) 50 _, s.err = s.sw.WriteString(safeMarkup)
75 } 51 }
76 } 52 }
77 53
78 // printAttrs sanitizes and prints a whitelist of attributes in el 54 // printAttr prints a space and then an HTML attribute node.
79 func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) { 55 func (s *sanitizer) printAttr(key, value string) {
80 » for _, a := range el.Attr { 56 » s.p(" ")
81 » » key := strings.ToLower(a.Key) 57 » s.p(key)
82 » » if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok { 58 » s.p("=\"")
83 » » » s.p(" ") 59 » s.p(html.EscapeString(value))
84 » » » s.p(key) 60 » s.p("\"")
85 » » » s.p("=\"")
86 » » » s.p(html.EscapeString(sanitizer(a.Val)))
87 » » » s.p("\"")
88 » » }
89 » }
90 }
91
92 // printElem prints the safe element with a whitelist of attributes.
93 // If allowedAttrs is nil, all attributes are omitted.
94 //
95 // Do not call for unsafe elements.
96 func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {
97 » tag := safeElement.DataAtom.String()
98 » s.p("<")
99 » s.p(tag)
100 » if allowedAttrs == nil {
101 » » // ignore attributes
102 » } else {
103 » » s.printAttrs(safeElement, allowedAttrs)
104 » }
105 » s.p(">")
106
107 » s.visitChildren(safeElement)
108
109 » s.p("</")
110 » s.p(tag)
111 » s.p(">")
112 } 61 }
113 62
114 func (s *sanitizer) visit(n *html.Node) { 63 func (s *sanitizer) visit(n *html.Node) {
115 switch n.Type { 64 switch n.Type {
116 case html.TextNode: 65 case html.TextNode:
117 // print it escaped. 66 // print it escaped.
118 s.p(html.EscapeString(n.Data)) 67 s.p(html.EscapeString(n.Data))
119 68
120 case html.ElementNode: 69 case html.ElementNode:
121 // This switch statement defines what HTML elements we allow. 70 // This switch statement defines what HTML elements we allow.
122 switch n.DataAtom { 71 switch n.DataAtom {
123 case atom.Br: 72 case atom.Br:
124 // br is allowed and it should not be closed 73 // br is allowed and it should not be closed
125 s.p("<br>") 74 s.p("<br>")
126 75
127 case atom.Script, atom.Style: 76 case atom.Script, atom.Style:
128 // ignore entirely 77 // ignore entirely
129 // do not visit children so we don't print inner text 78 // do not visit children so we don't print inner text
130 79
131 case atom.A: 80 case atom.A:
132 s.p(`<a rel="noopener" target="_blank"`) 81 s.p(`<a rel="noopener" target="_blank"`)
133 » » » s.printAttrs(n, anchorAttrs) 82
83 » » » for _, a := range n.Attr {
84 » » » » if a.Namespace != "" {
85 » » » » » continue
86 » » » » }
87 » » » » switch strings.ToLower(a.Key) {
88 » » » » case "href":
89 » » » » » s.printAttr("href", sanitizeURL(a.Val))
90
91 » » » » case "alt":
92 » » » » » s.printAttr("alt", a.Val)
93 » » » » }
94 » » » }
95
134 s.p(">") 96 s.p(">")
135 s.visitChildren(n) 97 s.visitChildren(n)
136 s.p("</a>") 98 s.p("</a>")
137 99
138 » » case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Table, atom.Strong, atom.Em: 100 » » case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Strong, atom.Em:
139 // print without attributes 101 // print without attributes
140 » » » s.printElem(n, nil) 102 » » » tag := n.DataAtom.String()
103 » » » s.p("<")
104 » » » s.p(tag)
105 » » » s.p(">")
141 106
142 » » case atom.Tr: 107 » » » s.visitChildren(n)
143 » » » s.printElem(n, trAttrs)
144 108
145 » » case atom.Td: 109 » » » s.p("</")
146 » » » s.printElem(n, tdAttrs) 110 » » » s.p(tag)
111 » » » s.p(">")
147 112
148 default: 113 default:
149 // ignore the element, but visit children. 114 // ignore the element, but visit children.
150 s.visitChildren(n) 115 s.visitChildren(n)
151 } 116 }
152 117
153 default: 118 default:
154 // ignore the node, but visit children. 119 // ignore the node, but visit children.
155 s.visitChildren(n) 120 s.visitChildren(n)
156 } 121 }
157 } 122 }
158 123
159 func (s *sanitizer) visitChildren(n *html.Node) { 124 func (s *sanitizer) visitChildren(n *html.Node) {
160 for c := n.FirstChild; c != nil; c = c.NextSibling { 125 for c := n.FirstChild; c != nil; c = c.NextSibling {
161 s.visit(c) 126 s.visit(c)
162 } 127 }
163 } 128 }
164 129
165 // Sanitize strips all HTML nodes except allowed ones. 130 // Sanitize strips all HTML nodes except allowed ones.
166 // 131 //
167 // Unless explicitly specified, attributes are stripped. 132 // Unless explicitly specified, attributes are stripped.
168 // Allowed elements: 133 // Allowed elements:
169 // - p, br 134 // - p, br
170 // - strong, em 135 // - strong, em
171 // - a 136 // - a
172 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced 137 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced
173 // with an innocuous one. 138 // with an innocuous one.
174 // - alt attribute is allowed 139 // - alt attribute is allowed
175 // - ul, ol, li 140 // - ul, ol, li
176 // - table
177 // - tr, td. Attributes rowspan/colspan are allowed.
178 // 141 //
179 // Elements <script> and <style> are ignored entirely. 142 // Elements <script> and <style> are ignored entirely.
180 // For all other HTML nodes, Sanitize ignores the node, but visits its children. 143 // For all other HTML nodes, Sanitize ignores the node, but visits its children.
181 func Sanitize(w io.Writer, r io.Reader) (err error) { 144 func Sanitize(w io.Writer, r io.Reader) (err error) {
182 var root *html.Node 145 var root *html.Node
183 root, err = html.Parse(r) 146 root, err = html.Parse(r)
184 if err != nil { 147 if err != nil {
185 return err 148 return err
186 } 149 }
187 150
188 sw, ok := w.(stringWriter) 151 sw, ok := w.(stringWriter)
189 if !ok { 152 if !ok {
190 bw := bufio.NewWriter(w) 153 bw := bufio.NewWriter(w)
191 defer func() { 154 defer func() {
192 ferr := bw.Flush() 155 ferr := bw.Flush()
193 if err == nil { 156 if err == nil {
194 err = ferr 157 err = ferr
195 } 158 }
196 }() 159 }()
197 sw = bw 160 sw = bw
198 } 161 }
199 162
200 s := sanitizer{sw: sw} 163 s := sanitizer{sw: sw}
201 s.visit(root) 164 s.visit(root)
202 return s.err 165 return s.err
203 } 166 }
OLD | NEW
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698