Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2017 The LUCI Authors. All rights reserved. | |
| 2 // Use of this source code is governed under the Apache License, Version 2.0 | |
| 3 // that can be found in the LICENSE file. | |
| 4 | |
| 5 // Package sanitizehtml implements a sanitizer of a very limited HTML. | |
|
Vadim Sh.
2017/05/02 00:16:24
it would be great to elaborate what exact subset o
nodir
2017/05/04 22:11:00
done, in Sanitize comment
| |
| 6 package sanitizehtml | |
| 7 | |
| 8 import ( | |
| 9 "io" | |
| 10 "net/url" | |
| 11 "strings" | |
| 12 "unicode" | |
| 13 | |
| 14 "golang.org/x/net/html" | |
| 15 ) | |
| 16 | |
| 17 // attrValueSanitizer sanitizes an attribute value. | |
| 18 type attrValueSanitizer func(string) string | |
| 19 | |
| 20 func alwaysSafe(s string) string { | |
| 21 return s | |
| 22 } | |
| 23 | |
| 24 func sanitizeNum(s string) string { | |
| 25 for i, r := range s { | |
| 26 if !unicode.IsDigit(r) { | |
| 27 // ignore r and the rest. | |
| 28 return s[:i] | |
| 29 } | |
| 30 } | |
| 31 return s | |
| 32 } | |
| 33 | |
| 34 func sanitizeURL(s string) string { | |
| 35 switch u, err := url.Parse(s); { | |
| 36 case err != nil: | |
| 37 return "#invalid-url-stripped" | |
| 38 case u.Scheme != "http" && u.Scheme != "https": | |
| 39 return "#non-http-or-https-url-stripped" | |
| 40 case u.Host == "": | |
| 41 return "#relative-url-stripped" | |
| 42 default: | |
| 43 return s | |
|
hinoka
2017/05/02 00:36:15
return u.EscapedPath() seems safer. It'll force-es
xtof
2017/05/04 18:26:18
Not-properly escaped query parameters are generall
nodir
2017/05/04 22:11:00
Done.
| |
| 44 } | |
| 45 } | |
| 46 | |
| 47 // allowedElems defines allowed HTML elements. | |
| 48 // A key is a lower-case element name and a value defines allowed attributes. | |
| 49 // The latter is mapping from lower case attribute name to a value sanitizer. | |
| 50 // If the map is nil, no attributes are allowed. | |
| 51 var allowedElems = map[string]map[string]attrValueSanitizer{ | |
| 52 "a": { | |
|
Vadim Sh.
2017/05/02 00:16:24
may be a good idea to mandatory add rel="noopener"
nodir
2017/05/04 22:11:00
Done.
| |
| 53 "alt": alwaysSafe, | |
| 54 "href": sanitizeURL, | |
| 55 }, | |
| 56 | |
| 57 "br": nil, | |
| 58 "p": nil, | |
| 59 | |
| 60 "ol": nil, | |
| 61 "ul": nil, | |
| 62 "li": nil, | |
| 63 | |
| 64 "strong": nil, | |
| 65 "em": nil, | |
| 66 | |
| 67 "table": nil, | |
| 68 "tr": { | |
| 69 "rowspan": sanitizeNum, | |
| 70 "colspan": sanitizeNum, | |
| 71 }, | |
| 72 "td": { | |
| 73 "rowspan": sanitizeNum, | |
| 74 "colspan": sanitizeNum, | |
| 75 }, | |
| 76 } | |
| 77 | |
| 78 // Sanitize strips all HTML nodes except | |
| 79 // - paragraphs | |
| 80 // - strong/emphasized text | |
| 81 // - valid absolute HTTP(S) links with optional alt. | |
| 82 // - ordered/unordered lists | |
| 83 // - tables with optional colspan/rowspan. | |
| 84 // Anything else is stripped. | |
| 85 func Sanitize(r io.Reader, w io.Writer) error { | |
| 86 root, err := html.Parse(r) | |
| 87 if err != nil { | |
| 88 return err | |
| 89 } | |
| 90 | |
| 91 var writeError error | |
| 92 write := func(s string) { | |
| 93 if writeError == nil { | |
| 94 _, writeError = io.WriteString(w, s) | |
| 95 } | |
| 96 } | |
| 97 | |
| 98 var visit func(n *html.Node) | |
| 99 visitChildren := func(n *html.Node) { | |
| 100 for c := n.FirstChild; c != nil; c = c.NextSibling { | |
| 101 visit(c) | |
| 102 } | |
| 103 } | |
| 104 visit = func(n *html.Node) { | |
| 105 switch n.Type { | |
| 106 case html.TextNode: | |
| 107 // print it escaped. | |
| 108 write(html.EscapeString(n.Data)) | |
| 109 | |
| 110 case html.ElementNode: | |
| 111 tag := strings.ToLower(n.Data) | |
| 112 if tag == "br" { | |
| 113 // br is allowed and it should not be closed | |
| 114 write("<br>") | |
| 115 return | |
| 116 } | |
| 117 | |
| 118 allowedAttrs, allowed := allowedElems[tag] | |
| 119 if !allowed { | |
| 120 // ignore the tag, but visit children. | |
| 121 visitChildren(n) | |
| 122 return | |
| 123 } | |
| 124 | |
| 125 // this tag is allowed. | |
| 126 | |
| 127 write("<") | |
| 128 write(tag) | |
| 129 if allowedAttrs == nil { | |
| 130 // ignore attributes | |
| 131 } else { | |
| 132 // write only the allowed attributes. | |
| 133 for _, a := range n.Attr { | |
| 134 key := strings.ToLower(a.Key) | |
| 135 if sanitizer, ok := allowedAttrs[key]; a .Namespace == "" && ok { | |
|
hinoka
2017/05/02 00:36:15
Using template/html execute might be better here b
xtof
2017/05/04 18:26:18
I think what's here is fine, and (much) simpler.
nodir
2017/05/04 22:11:00
Acknowledged.
| |
| 136 write(" ") | |
| 137 write(key) | |
| 138 write("=\"") | |
| 139 write(html.EscapeString(sanitize r(a.Val))) | |
| 140 write("\"") | |
| 141 } | |
| 142 } | |
| 143 } | |
| 144 write(">") | |
| 145 | |
| 146 visitChildren(n) | |
| 147 | |
| 148 write("</") | |
| 149 write(n.Data) | |
|
xtof
2017/05/04 18:26:18
write(tag) would be slightly more "obviously safe"
nodir
2017/05/04 22:11:01
I've changed it a bit. I hope it is still OK becau
| |
| 150 write(">") | |
| 151 | |
| 152 default: | |
| 153 // ignore the tag, but visit children. | |
|
xtof
2017/05/04 18:26:18
I *think* this will cause the content of <script>
nodir
2017/05/04 22:11:00
indeed, i should not print contents of <script> an
| |
| 154 visitChildren(n) | |
| 155 } | |
| 156 } | |
| 157 visit(root) | |
| 158 return writeError | |
| 159 } | |
| OLD | NEW |