Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(19)

Side by Side Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)
Patch Set: add test Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2017 The LUCI Authors. All rights reserved.
2 // Use of this source code is governed under the Apache License, Version 2.0
3 // that can be found in the LICENSE file.
4
5 // Package sanitizehtml implements a sanitizer of a very limited HTML.
6 // See Sanitize comment.
7 package sanitizehtml
8
9 import (
10 "io"
11 "net/url"
12 "strings"
13 "unicode"
14
15 "golang.org/x/net/html"
16 )
17
18 // attrValueSanitizer sanitizes an attribute value.
19 // If flagged was returned as true, the value was possibly harmful,
20 // i.e. possibly it is an attack.
21 type attrValueSanitizer func(value string) (flagged bool, safeValue string)
22
23 func alwaysSafe(s string) (bool, string) {
24 return false, s
25 }
26
27 func sanitizeNum(s string) (bool, string) {
28 for i, r := range s {
29 if !unicode.IsDigit(r) {
nigeltao1 2017/05/04 23:44:23 I'd just look for ASCII digits. It's not like you'
nodir 2017/05/05 05:23:07 Done.
30 // ignore r and the rest.
31 return false, s[:i]
32 }
33 }
34 return false, s
35 }
36
37 func sanitizeURL(s string) (bool, string) {
38 switch u, err := url.Parse(s); {
39 case err != nil:
40 return false, "#invalid-url-stripped"
xtof 2017/05/05 15:29:16 It would be preferable to use about:invalid#reason
nodir 2017/05/05 16:10:06 Done. I was doing what Gitiles is doing: https://g
41 case strings.EqualFold(u.Scheme, "javascript"):
42 return true, "#non-http-or-https-url-stripped"
43 case u.Scheme != "http" && u.Scheme != "https":
44 return false, "#non-http-or-https-url-stripped"
45 case u.Host == "":
46 return false, "#relative-url-stripped"
47 default:
48 // re-serialize the URL to ensure that what we return is what we think
49 // we parsed.
50 return false, u.String()
51 }
52 }
53
54 type attrMap map[string]attrValueSanitizer
55
56 var (
57 anchorAttrs = attrMap{
58 "alt": alwaysSafe,
59 "href": sanitizeURL,
60 }
61 trAttrs = attrMap{
62 "rowspan": sanitizeNum,
xtof 2017/05/05 15:29:16 If you want to remove a little code, feel free to
nodir 2017/05/05 16:10:06 Done. I am always for deleting code.
63 "colspan": sanitizeNum,
64 }
65 tdAttrs = attrMap{
66 "rowspan": sanitizeNum,
67 "colspan": sanitizeNum,
68 }
69 )
70
71 type sanitizer struct {
72 w io.Writer
nigeltao1 2017/05/04 23:44:23 If you care enough about efficiency (both in terms
nodir 2017/05/05 05:23:07 Done.
73 err error
74 flagged bool
75 }
76
77 // p prints the text, unless there was an error before.
78 func (s *sanitizer) p(safeMarkup string) {
79 if s.err == nil {
80 _, s.err = io.WriteString(s.w, safeMarkup)
81 }
82 }
83
84 // printAttrs sanitizes and prints a whitelist of attributes in el
85 func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) {
86 for _, a := range el.Attr {
87 key := strings.ToLower(a.Key)
88 if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok {
89 s.p(" ")
90 s.p(key)
91 s.p("=\"")
92 flagged, safeValue := sanitizer(a.Val)
93 if flagged {
94 s.flagged = true
95 }
96 s.p(html.EscapeString(safeValue))
97 s.p("\"")
98 }
99 }
100 }
101
102 // printElem prints the safe element with a whitelist of attributes.
103 // If allowedAttrs is nil, all attributes are omitted.
nigeltao1 2017/05/04 23:44:23 Typo in "omitted".
nodir 2017/05/05 05:23:07 Done.
104 //
105 // Do not call for unsafe elements.
106 func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {
107 tag := strings.ToLower(safeElement.Data)
108 s.p("<")
109 s.p(tag)
110 if allowedAttrs == nil {
111 // ignore attributes
112 } else {
113 s.printAttrs(safeElement, allowedAttrs)
114 }
115 s.p(">")
116
117 s.visitChildren(safeElement)
118
119 s.p("</")
120 s.p(tag)
121 s.p(">")
122 }
123
124 func (s *sanitizer) visit(n *html.Node) {
125 switch n.Type {
126 case html.TextNode:
127 // print it escaped.
128 s.p(html.EscapeString(n.Data))
129
130 case html.ElementNode:
131 // This switch statement defines what HTML elements we allow.
132 switch strings.ToLower(n.Data) {
nigeltao1 2017/05/04 23:44:23 The ToLower'ing is unnecessary if you compare atom
nodir 2017/05/05 05:23:07 nice, thanks, done
133 case "br":
134 // br is allowed and it should not be closed
135 s.p("<br>")
136
137 case "script":
138 // ignore entirely
139 // do not visit children so we don't print inner text
140 s.flagged = true
141
142 case "style":
143 // ignore entirely
144 // do not visit children so we don't print inner text
145
146 case "a":
147 s.p(`<a rel="noopener" target="_blank"`)
148 s.printAttrs(n, anchorAttrs)
149 s.p(">")
150 s.visitChildren(n)
151 s.p("</a>")
152
153 case "p", "ol", "ul", "li", "table", "strong", "em":
154 // print without attributes
155 s.printElem(n, nil)
156
157 case "tr":
158 s.printElem(n, trAttrs)
159
160 case "td":
161 s.printElem(n, tdAttrs)
162
163 default:
164 // ignore the element, but visit children.
165 s.visitChildren(n)
166 }
167
168 default:
169 // ignore the node, but visit children.
170 s.visitChildren(n)
171 }
172 }
173
174 func (s *sanitizer) visitChildren(n *html.Node) {
175 for c := n.FirstChild; c != nil; c = c.NextSibling {
176 s.visit(c)
177 }
178 }
179
180 // Sanitize strips all HTML nodes except allowed ones.
181 //
182 // Unless explicitly specified, attributes are stripped.
183 // Allowed elements:
184 // - p, br
185 // - strong, em
186 // - a
187 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced
188 // with an innocuous fragment-only link.
189 // - alt attribute is allowed
190 // - ul, ol, li
191 // - table
192 // - tr, td. Attributes rowspan/colspan are allowed, but if a value contains a
193 // non-digit character, the character and the rest of the value are stripped.
194 //
195 // Elements <script> and <style> are ignored entirely.
196 // For all other HTML nodes, Sanitize ignores the node, but visits its children.
197 //
198 // The returned value flagged, if true, means that the input HTML was possibly
199 // harmful and that logging it is advised.
xtof 2017/05/05 15:29:16 There are many other ways HTML can execute script
nodir 2017/05/05 16:10:06 Removed
200 func Sanitize(r io.Reader, w io.Writer) (flagged bool, err error) {
201 var root *html.Node
202 root, err = html.Parse(r)
203 if err != nil {
204 return
205 }
206
207 s := sanitizer{w: w}
208 s.visit(root)
209 return s.flagged, s.err
210 }
OLDNEW
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698