Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(489)

Side by Side Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)
Patch Set: fix comments Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2017 The LUCI Authors. All rights reserved.
2 // Use of this source code is governed under the Apache License, Version 2.0
3 // that can be found in the LICENSE file.
4
5 // Package sanitizehtml implements a sanitizer for a very limited subset of HTML.
6 // See the documentation of Sanitize for details.
7 package sanitizehtml
8
9 import (
10 "bufio"
11 "io"
12 "net/url"
13 "strings"
14
15 "golang.org/x/net/html"
16 "golang.org/x/net/html/atom"
17 )
18
// attrValueSanitizer maps a raw attribute value to the value that is written
// to the output in its place.
type attrValueSanitizer func(value string) string
21
// alwaysSafe is the attrValueSanitizer for attributes whose value needs no
// rewriting: it returns s unchanged.
func alwaysSafe(s string) string { return s }
25
// sanitizeURL returns s re-serialized if it parses as an http or https URL
// with a non-empty host. Otherwise it returns an about:invalid URL whose
// fragment names the reason the input was rejected: "malformed-url",
// "disallowed-scheme" or "relative-url".
func sanitizeURL(s string) string {
	const rejected = "about:invalid#sanitized&reason="

	parsed, err := url.Parse(s)
	if err != nil {
		return rejected + "malformed-url"
	}
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return rejected + "disallowed-scheme"
	}
	if parsed.Host == "" {
		return rejected + "relative-url"
	}

	// Emit the parser's own serialization instead of the raw input, so the
	// output corresponds to the URL that was actually checked above.
	return parsed.String()
}
44
45 type attrMap map[string]attrValueSanitizer
46
47 var (
48 anchorAttrs = attrMap{
49 "alt": alwaysSafe,
50 "href": sanitizeURL,
51 }
52 trAttrs = attrMap{
53 "rowspan": alwaysSafe,
54 "colspan": alwaysSafe,
55 }
56 tdAttrs = attrMap{
57 "rowspan": alwaysSafe,
58 "colspan": alwaysSafe,
59 }
60 )
61
// stringWriter is the only capability the sanitizer needs from its output:
// a WriteString method with the usual (bytes written, error) result.
type stringWriter interface {
	WriteString(s string) (n int, err error)
}

// sanitizer carries the output destination and the first write error while
// the node tree is being walked.
type sanitizer struct {
	// sw receives the sanitized markup.
	sw stringWriter
	// err is the first error returned by sw; once set, nothing more is written.
	err error
}
70
71 // p prints the text, unless there was an error before.
72 func (s *sanitizer) p(safeMarkup string) {
73 if s.err == nil {
74 _, s.err = s.sw.WriteString(safeMarkup)
75 }
76 }
77
78 // printAttrs sanitizes and prints a whitelist of attributes in el
79 func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) {
80 for _, a := range el.Attr {
81 key := strings.ToLower(a.Key)
82 if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok {
83 s.p(" ")
84 s.p(key)
85 s.p("=\"")
86 s.p(html.EscapeString(sanitizer(a.Val)))
87 s.p("\"")
88 }
89 }
90 }
91
92 // printElem prints the safe element with a whitelist of attributes.
93 // If allowedAttrs is nil, all attributes are omitted.
94 //
95 // Do not call for unsafe elements.
96 func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {
97 tag := safeElement.DataAtom.String()
98 s.p("<")
99 s.p(tag)
100 if allowedAttrs == nil {
101 // ignore attributes
102 } else {
103 s.printAttrs(safeElement, allowedAttrs)
104 }
105 s.p(">")
106
107 s.visitChildren(safeElement)
108
109 s.p("</")
110 s.p(tag)
111 s.p(">")
112 }
113
114 func (s *sanitizer) visit(n *html.Node) {
115 switch n.Type {
116 case html.TextNode:
117 // print it escaped.
118 s.p(html.EscapeString(n.Data))
119
120 case html.ElementNode:
121 // This switch statement defines what HTML elements we allow.
122 switch n.DataAtom {
123 case atom.Br:
124 // br is allowed and it should not be closed
125 s.p("<br>")
126
127 case atom.Script, atom.Style:
128 // ignore entirely
129 // do not visit children so we don't print inner text
130
131 case atom.A:
132 s.p(`<a rel="noopener" target="_blank"`)
133 s.printAttrs(n, anchorAttrs)
134 s.p(">")
135 s.visitChildren(n)
136 s.p("</a>")
137
138 case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Table, atom.Strong, atom.Em:
139 // print without attributes
140 s.printElem(n, nil)
141
142 case atom.Tr:
143 s.printElem(n, trAttrs)
144
145 case atom.Td:
146 s.printElem(n, tdAttrs)
147
148 default:
149 // ignore the element, but visit children.
150 s.visitChildren(n)
151 }
152
153 default:
154 // ignore the node, but visit children.
155 s.visitChildren(n)
156 }
157 }
158
159 func (s *sanitizer) visitChildren(n *html.Node) {
160 for c := n.FirstChild; c != nil; c = c.NextSibling {
161 s.visit(c)
162 }
163 }
164
165 // Sanitize strips all HTML nodes except allowed ones.
166 //
167 // Unless explicitly specified, attributes are stripped.
168 // Allowed elements:
169 // - p, br
170 // - strong, em
171 // - a
172 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced
173 // with an innocuous one.
174 // - alt attribute is allowed
175 // - ul, ol, li
176 // - table
177 // - tr, td. Attributes rowspan/colspan are allowed.
178 //
179 // Elements <script> and <style> are ignored entirely.
180 // For all other HTML nodes, Sanitize ignores the node, but visits its children.
181 func Sanitize(w io.Writer, r io.Reader) (err error) {
182 var root *html.Node
183 root, err = html.Parse(r)
184 if err != nil {
185 return err
186 }
187
188 sw, ok := w.(stringWriter)
189 if !ok {
190 bw := bufio.NewWriter(w)
191 defer func() {
192 ferr := bw.Flush()
193 if err == nil {
194 err = ferr
195 }
196 }()
197 sw = bw
198 }
199
200 s := sanitizer{sw: sw}
201 s.visit(root)
202 return s.err
203 }
OLDNEW
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698