Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(29)

Side by Side Diff: common/data/text/sanitizehtml/sanitize.go

Issue 2849353002: sanitizehtml: add a package to sanitize HTML (Closed)
Patch Set: nit Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The LUCI Authors. All rights reserved.
2 // Use of this source code is governed under the Apache License, Version 2.0
3 // that can be found in the LICENSE file.
4
5 // Package sanitizehtml implements a sanitizer of a very limited HTML.
Vadim Sh. 2017/05/02 00:16:24 it would be great to elaborate what exact subset o
nodir 2017/05/04 22:11:00 done, in Sanitize comment
6 package sanitizehtml
7
8 import (
9 "io"
10 "net/url"
11 "strings"
12 "unicode"
13
14 "golang.org/x/net/html"
15 )
16
17 // attrValueSanitizer sanitizes an attribute value.
18 type attrValueSanitizer func(string) string
19
20 func alwaysSafe(s string) string {
21 return s
22 }
23
24 func sanitizeNum(s string) string {
25 for i, r := range s {
26 if !unicode.IsDigit(r) {
27 // ignore r and the rest.
28 return s[:i]
29 }
30 }
31 return s
32 }
33
34 func sanitizeURL(s string) string {
35 switch u, err := url.Parse(s); {
36 case err != nil:
37 return "#invalid-url-stripped"
38 case u.Scheme != "http" && u.Scheme != "https":
39 return "#non-http-or-https-url-stripped"
40 case u.Host == "":
41 return "#relative-url-stripped"
42 default:
43 return s
hinoka 2017/05/02 00:36:15 return u.EscapedPath() seems safer. It'll force-es
xtof 2017/05/04 18:26:18 Not-properly escaped query parameters are generall
nodir 2017/05/04 22:11:00 Done.
44 }
45 }
46
47 // allowedElems defines allowed HTML elements.
48 // A key is a lower-case element name and a value defines allowed attributes.
49 // The latter is mapping from lower case attribute name to a value sanitizer.
50 // If the map is nil, no attributes are allowed.
51 var allowedElems = map[string]map[string]attrValueSanitizer{
52 "a": {
Vadim Sh. 2017/05/02 00:16:24 may be a good idea to mandatory add rel="noopener"
nodir 2017/05/04 22:11:00 Done.
53 "alt": alwaysSafe,
54 "href": sanitizeURL,
55 },
56
57 "br": nil,
58 "p": nil,
59
60 "ol": nil,
61 "ul": nil,
62 "li": nil,
63
64 "strong": nil,
65 "em": nil,
66
67 "table": nil,
68 "tr": {
69 "rowspan": sanitizeNum,
70 "colspan": sanitizeNum,
71 },
72 "td": {
73 "rowspan": sanitizeNum,
74 "colspan": sanitizeNum,
75 },
76 }
77
78 // Sanitize strips all HTML nodes except
79 // - paragraphs
80 // - strong/emphasized text
81 // - valid absolute HTTP(S) links with optional alt.
82 // - ordered/unordered lists
83 // - tables with optional colspan/rowspan.
84 // Anything else is stripped.
85 func Sanitize(r io.Reader, w io.Writer) error {
86 root, err := html.Parse(r)
87 if err != nil {
88 return err
89 }
90
91 var writeError error
92 write := func(s string) {
93 if writeError == nil {
94 _, writeError = io.WriteString(w, s)
95 }
96 }
97
98 var visit func(n *html.Node)
99 visitChildren := func(n *html.Node) {
100 for c := n.FirstChild; c != nil; c = c.NextSibling {
101 visit(c)
102 }
103 }
104 visit = func(n *html.Node) {
105 switch n.Type {
106 case html.TextNode:
107 // print it escaped.
108 write(html.EscapeString(n.Data))
109
110 case html.ElementNode:
111 tag := strings.ToLower(n.Data)
112 if tag == "br" {
113 // br is allowed and it should not be closed
114 write("<br>")
115 return
116 }
117
118 allowedAttrs, allowed := allowedElems[tag]
119 if !allowed {
120 // ignore the tag, but visit children.
121 visitChildren(n)
122 return
123 }
124
125 // this tag is allowed.
126
127 write("<")
128 write(tag)
129 if allowedAttrs == nil {
130 // ignore attributes
131 } else {
132 // write only the allowed attributes.
133 for _, a := range n.Attr {
134 key := strings.ToLower(a.Key)
135 if sanitizer, ok := allowedAttrs[key]; a .Namespace == "" && ok {
hinoka 2017/05/02 00:36:15 Using template/html execute might be better here b
xtof 2017/05/04 18:26:18 I think what's here is fine, and (much) simpler.
nodir 2017/05/04 22:11:00 Acknowledged.
136 write(" ")
137 write(key)
138 write("=\"")
139 write(html.EscapeString(sanitize r(a.Val)))
140 write("\"")
141 }
142 }
143 }
144 write(">")
145
146 visitChildren(n)
147
148 write("</")
149 write(n.Data)
xtof 2017/05/04 18:26:18 write(tag) would be slightly more "obviously safe"
nodir 2017/05/04 22:11:01 I've changed it a bit. I hope it is still OK becau
150 write(">")
151
152 default:
153 // ignore the tag, but visit children.
xtof 2017/05/04 18:26:18 I *think* this will cause the content of <script>
nodir 2017/05/04 22:11:00 indeed, i should not print contents of <script> an
154 visitChildren(n)
155 }
156 }
157 visit(root)
158 return writeError
159 }
OLDNEW
« no previous file with comments | « no previous file | common/data/text/sanitizehtml/sanitize_test.go » ('j') | common/data/text/sanitizehtml/sanitize_test.go » ('J')

Powered by Google App Engine
This is Rietveld 408576698