Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(333)

Side by Side Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)
Patch Set: malformed Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2017 The LUCI Authors. All rights reserved.
2 // Use of this source code is governed under the Apache License, Version 2.0
3 // that can be found in the LICENSE file.
4
5 // Package sanitizehtml implements a sanitizer of a very limited HTML.
6 // See Sanitize comment.
7 package sanitizehtml
8
9 import (
10 "bufio"
11 "io"
12 "net/url"
13 "strings"
14
15 "golang.org/x/net/html"
16 "golang.org/x/net/html/atom"
17 )
18
// attrValueSanitizer takes the raw value of an HTML attribute and returns the
// value that should be emitted in its place.
type attrValueSanitizer func(value string) string

// alwaysSafe is the identity attrValueSanitizer: it returns s unchanged.
func alwaysSafe(s string) string {
	return s
}
25
// sanitizeURL returns s re-serialized if it parses as an absolute http or
// https URL with a non-empty host. Otherwise it returns an "about:invalid#..."
// URL whose fragment names the reason the input was rejected:
//
//   - url-is-malformed: url.Parse returned an error.
//   - url-is-not-http-or-https: the scheme is neither "http" nor "https"
//     (this includes scheme-less, relative references).
//   - url-is-relative: the scheme is acceptable but the host is empty.
func sanitizeURL(s string) string {
	const invalidPrefix = "about:invalid#"

	parsed, err := url.Parse(s)
	if err != nil {
		return invalidPrefix + "url-is-malformed"
	}
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return invalidPrefix + "url-is-not-http-or-https"
	}
	if parsed.Host == "" {
		return invalidPrefix + "url-is-relative"
	}

	// Emit the parsed form rather than the input so that the output is exactly
	// what was validated above.
	return parsed.String()
}
47
48 type attrMap map[string]attrValueSanitizer
49
50 var (
51 anchorAttrs = attrMap{
52 "alt": alwaysSafe,
53 "href": sanitizeURL,
54 }
55 trAttrs = attrMap{
56 "rowspan": alwaysSafe,
57 "colspan": alwaysSafe,
58 }
59 tdAttrs = attrMap{
60 "rowspan": alwaysSafe,
61 "colspan": alwaysSafe,
62 }
63 )
64
// stringWriter is the minimal output sink the sanitizer needs: something
// that can accept strings.
type stringWriter interface {
	WriteString(string) (int, error)
}

// sanitizer writes sanitized markup to sw and remembers the first write
// error in err.
type sanitizer struct {
	sw  stringWriter
	err error
}

// p writes safeMarkup to the underlying writer. Once a write has failed,
// every later call is a no-op, so err keeps the first failure.
//
// The argument is written verbatim; callers are responsible for escaping.
func (s *sanitizer) p(safeMarkup string) {
	if s.err != nil {
		return
	}
	_, s.err = s.sw.WriteString(safeMarkup)
}
80
81 // printAttrs sanitizes and prints a whitelist of attributes in el
82 func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) {
83 for _, a := range el.Attr {
84 key := strings.ToLower(a.Key)
85 if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok {
86 s.p(" ")
87 s.p(key)
88 s.p("=\"")
89 s.p(html.EscapeString(sanitizer(a.Val)))
90 s.p("\"")
91 }
92 }
93 }
94
95 // printElem prints the safe element with a whitelist of attributes.
96 // If allowedAttrs is nil, all attributes are omitted.
97 //
98 // Do not call for unsafe elements.
99 func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) {
100 tag := safeElement.DataAtom.String()
101 s.p("<")
102 s.p(tag)
103 if allowedAttrs == nil {
104 // ignore attributes
105 } else {
106 s.printAttrs(safeElement, allowedAttrs)
107 }
108 s.p(">")
109
110 s.visitChildren(safeElement)
111
112 s.p("</")
113 s.p(tag)
114 s.p(">")
115 }
116
117 func (s *sanitizer) visit(n *html.Node) {
118 switch n.Type {
119 case html.TextNode:
120 // print it escaped.
121 s.p(html.EscapeString(n.Data))
122
123 case html.ElementNode:
124 // This switch statement defines what HTML elements we allow.
125 switch n.DataAtom {
126 case atom.Br:
127 // br is allowed and it should not be closed
128 s.p("<br>")
129
130 case atom.Script, atom.Style:
131 // ignore entirely
132 // do not visit children so we don't print inner text
133
134 case atom.A:
135 s.p(`<a rel="noopener" target="_blank"`)
136 s.printAttrs(n, anchorAttrs)
137 s.p(">")
138 s.visitChildren(n)
139 s.p("</a>")
140
141 case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Table, atom.Strong, atom.Em:
142 // print without attributes
143 s.printElem(n, nil)
144
145 case atom.Tr:
146 s.printElem(n, trAttrs)
147
148 case atom.Td:
149 s.printElem(n, tdAttrs)
150
151 default:
152 // ignore the element, but visit children.
153 s.visitChildren(n)
154 }
155
156 default:
157 // ignore the node, but visit children.
158 s.visitChildren(n)
159 }
160 }
161
162 func (s *sanitizer) visitChildren(n *html.Node) {
163 for c := n.FirstChild; c != nil; c = c.NextSibling {
164 s.visit(c)
165 }
166 }
167
168 // Sanitize strips all HTML nodes except allowed ones.
169 //
170 // Unless explicitly specified, attributes are stripped.
171 // Allowed elements:
172 // - p, br
173 // - strong, em
174 // - a
175 // - if href attribute is not a valid absolute HTTP(s) link, it is replaced
176 // with a innocuous fragment-only link.
177 // - alt attribute is allowed
178 // - ul, ol, li
179 // - table
180 // - tr, td. Attributes rowspan/colspan are allowed, but if a value contains a
181 // non-digit character, the character and the rest of the value is stripped.
182 //
183 // Elements <script> and <style> are ignored entirely.
184 // For all other HTML nodes, Sanitize ignores the node, but visits its children.
185 func Sanitize(r io.Reader, w io.Writer) (err error) {
nigeltao1 2017/05/10 02:27:20 The general Go style is destination args before so
nodir 2017/05/10 06:48:48 Done.
186 var root *html.Node
187 root, err = html.Parse(r)
188 if err != nil {
189 return err
190 }
191
192 sw, ok := w.(stringWriter)
193 if !ok {
194 bw := bufio.NewWriter(w)
195 defer func() {
196 ferr := bw.Flush()
197 if err == nil {
198 err = ferr
199 }
200 }()
201 sw = bw
202 }
203
204 s := sanitizer{sw: sw}
205 s.visit(root)
206 return s.err
207 }
OLDNEW
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698