OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "net/base/escape.h" | 5 #include "net/base/escape.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 | 8 |
9 #include "base/logging.h" | 9 #include "base/logging.h" |
10 #include "base/memory/scoped_ptr.h" | 10 #include "base/memory/scoped_ptr.h" |
(...skipping 22 matching lines...) Expand all Loading... |
33 return ((map[c >> 5] & (1 << (c & 31))) != 0); | 33 return ((map[c >> 5] & (1 << (c & 31))) != 0); |
34 } | 34 } |
35 | 35 |
36 uint32 map[8]; | 36 uint32 map[8]; |
37 }; | 37 }; |
38 | 38 |
39 // Given text to escape and a Charmap defining which values to escape, | 39 // Given text to escape and a Charmap defining which values to escape, |
40 // return an escaped string. If use_plus is true, spaces are converted | 40 // return an escaped string. If use_plus is true, spaces are converted |
41 // to +, otherwise, if spaces are in the charmap, they are converted to | 41 // to +, otherwise, if spaces are in the charmap, they are converted to |
42 // %20. | 42 // %20. |
43 std::string Escape(const std::string& text, const Charmap& charmap, | 43 std::string Escape(const std::string& text, |
| 44 const Charmap& charmap, |
44 bool use_plus) { | 45 bool use_plus) { |
45 std::string escaped; | 46 std::string escaped; |
46 escaped.reserve(text.length() * 3); | 47 escaped.reserve(text.length() * 3); |
47 for (unsigned int i = 0; i < text.length(); ++i) { | 48 for (unsigned int i = 0; i < text.length(); ++i) { |
48 unsigned char c = static_cast<unsigned char>(text[i]); | 49 unsigned char c = static_cast<unsigned char>(text[i]); |
49 if (use_plus && ' ' == c) { | 50 if (use_plus && ' ' == c) { |
50 escaped.push_back('+'); | 51 escaped.push_back('+'); |
51 } else if (charmap.Contains(c)) { | 52 } else if (charmap.Contains(c)) { |
52 escaped.push_back('%'); | 53 escaped.push_back('%'); |
53 escaped.push_back(IntToHex(c >> 4)); | 54 escaped.push_back(IntToHex(c >> 4)); |
(...skipping 19 matching lines...) Expand all Loading... |
73 // representation in a URL. This means that unescaping will change the URL, and | 74 // representation in a URL. This means that unescaping will change the URL, and |
74 // you could get different behavior if you copy and paste the URL, or press | 75 // you could get different behavior if you copy and paste the URL, or press |
75 // enter in the URL bar. The list of characters that fall into this category | 76 // enter in the URL bar. The list of characters that fall into this category |
76 // are the ones labeled PASS (allow either escaped or unescaped) in the big | 77 // are the ones labeled PASS (allow either escaped or unescaped) in the big |
77 // lookup table at the top of url/url_canon_path.cc. Also, characters | 78 // lookup table at the top of url/url_canon_path.cc. Also, characters |
78 // that have CHAR_QUERY set in url/url_canon_internal.cc but are not | 79 // that have CHAR_QUERY set in url/url_canon_internal.cc but are not |
79 // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are | 80 // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are |
80 // not unescaped, to avoid turning a valid url according to spec into an | 81 // not unescaped, to avoid turning a valid url according to spec into an |
81 // invalid one. | 82 // invalid one. |
82 const char kUrlUnescape[128] = { | 83 const char kUrlUnescape[128] = { |
83 // NULL, control chars... | 84 // NULL, control chars... |
84 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
86 // ' ' ! " # $ % & ' ( ) * + , - . / | 87 // ' ' ! " # $ % & ' ( ) * + , - . / |
87 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, | 88 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, |
88 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? | 89 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? |
89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, | 90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, |
90 // @ A B C D E F G H I J K L M N O | 91 // @ A B C D E F G H I J K L M N O |
91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 92 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
92 // P Q R S T U V W X Y Z [ \ ] ^ _ | 93 // P Q R S T U V W X Y Z [ \ ] ^ _ |
93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, | 94 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, |
94 // ` a b c d e f g h i j k l m n o | 95 // ` a b c d e f g h i j k l m n o |
95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 96 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
96 // p q r s t u v w x y z { | } ~ <NBSP> | 97 // p q r s t u v w x y z { | } ~ <NBSP> |
97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 | 98 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0}; |
98 }; | |
99 | 99 |
100 // Attempts to unescape the sequence at |index| within |escaped_text|. If | 100 // Attempts to unescape the sequence at |index| within |escaped_text|. If |
101 // successful, sets |value| to the unescaped value. Returns whether | 101 // successful, sets |value| to the unescaped value. Returns whether |
102 // unescaping succeeded. | 102 // unescaping succeeded. |
103 template<typename STR> | 103 template <typename STR> |
104 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text, | 104 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text, |
105 size_t index, | 105 size_t index, |
106 unsigned char* value) { | 106 unsigned char* value) { |
107 if ((index + 2) >= escaped_text.size()) | 107 if ((index + 2) >= escaped_text.size()) |
108 return false; | 108 return false; |
109 if (escaped_text[index] != '%') | 109 if (escaped_text[index] != '%') |
110 return false; | 110 return false; |
111 const typename STR::value_type most_sig_digit( | 111 const typename STR::value_type most_sig_digit( |
112 static_cast<typename STR::value_type>(escaped_text[index + 1])); | 112 static_cast<typename STR::value_type>(escaped_text[index + 1])); |
113 const typename STR::value_type least_sig_digit( | 113 const typename STR::value_type least_sig_digit( |
114 static_cast<typename STR::value_type>(escaped_text[index + 2])); | 114 static_cast<typename STR::value_type>(escaped_text[index + 2])); |
115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { | 115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { |
116 *value = HexDigitToInt(most_sig_digit) * 16 + | 116 *value = |
117 HexDigitToInt(least_sig_digit); | 117 HexDigitToInt(most_sig_digit) * 16 + HexDigitToInt(least_sig_digit); |
118 return true; | 118 return true; |
119 } | 119 } |
120 return false; | 120 return false; |
121 } | 121 } |
122 | 122 |
123 // Unescapes |escaped_text| according to |rules|, returning the resulting | 123 // Unescapes |escaped_text| according to |rules|, returning the resulting |
124 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects | 124 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects |
125 // the alterations done to the string that are not one-character-to-one- | 125 // the alterations done to the string that are not one-character-to-one- |
126 // character. The resulting |adjustments| will always be sorted by increasing | 126 // character. The resulting |adjustments| will always be sorted by increasing |
127 // offset. | 127 // offset. |
128 template<typename STR> | 128 template <typename STR> |
129 STR UnescapeURLWithAdjustmentsImpl( | 129 STR UnescapeURLWithAdjustmentsImpl( |
130 const STR& escaped_text, | 130 const STR& escaped_text, |
131 UnescapeRule::Type rules, | 131 UnescapeRule::Type rules, |
132 base::OffsetAdjuster::Adjustments* adjustments) { | 132 base::OffsetAdjuster::Adjustments* adjustments) { |
133 if (adjustments) | 133 if (adjustments) |
134 adjustments->clear(); | 134 adjustments->clear(); |
135 // Do not unescape anything, return the |escaped_text| text. | 135 // Do not unescape anything, return the |escaped_text| text. |
136 if (rules == UnescapeRule::NONE) | 136 if (rules == UnescapeRule::NONE) |
137 return escaped_text; | 137 return escaped_text; |
138 | 138 |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
182 i += 5; | 182 i += 5; |
183 continue; | 183 continue; |
184 } | 184 } |
185 | 185 |
186 // Check for other BiDi control characters. | 186 // Check for other BiDi control characters. |
187 if ((first_byte == 0xE2) && | 187 if ((first_byte == 0xE2) && |
188 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && | 188 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && |
189 ((second_byte == 0x80) || (second_byte == 0x81))) { | 189 ((second_byte == 0x80) || (second_byte == 0x81))) { |
190 unsigned char third_byte; | 190 unsigned char third_byte; |
191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) && | 191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) && |
192 ((second_byte == 0x80) ? | 192 ((second_byte == 0x80) |
193 ((third_byte == 0x8E) || (third_byte == 0x8F) || | 193 ? ((third_byte == 0x8E) || (third_byte == 0x8F) || |
194 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) : | 194 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) |
195 ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) { | 195 : ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) { |
196 result.append(escaped_text, i, 9); | 196 result.append(escaped_text, i, 9); |
197 i += 8; | 197 i += 8; |
198 continue; | 198 continue; |
199 } | 199 } |
200 } | 200 } |
201 | 201 |
202 if (first_byte >= 0x80 || // Unescape all high-bit characters. | 202 if (first_byte >= 0x80 || // Unescape all high-bit characters. |
203 // For 7-bit characters, the lookup table tells us all valid chars. | 203 // For 7-bit characters, the lookup table tells us all valid chars. |
204 (kUrlUnescape[first_byte] || | 204 (kUrlUnescape[first_byte] || |
205 // ...and we allow some additional unescaping when flags are set. | 205 // ...and we allow some additional unescaping when flags are set. |
(...skipping 24 matching lines...) Expand all Loading... |
230 | 230 |
231 return result; | 231 return result; |
232 } | 232 } |
233 | 233 |
234 template <class str> | 234 template <class str> |
235 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { | 235 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { |
236 static const struct { | 236 static const struct { |
237 char key; | 237 char key; |
238 const char* replacement; | 238 const char* replacement; |
239 } kCharsToEscape[] = { | 239 } kCharsToEscape[] = { |
240 { '<', "&lt;" }, | 240 {'<', "&lt;"}, |
241 { '>', "&gt;" }, | 241 {'>', "&gt;"}, |
242 { '&', "&amp;" }, | 242 {'&', "&amp;"}, |
243 { '"', "&quot;" }, | 243 {'"', "&quot;"}, |
244 { '\'', "&#39;" }, | 244 {'\'', "&#39;"}, |
245 }; | 245 }; |
246 size_t k; | 246 size_t k; |
247 for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) { | 247 for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) { |
248 if (c == kCharsToEscape[k].key) { | 248 if (c == kCharsToEscape[k].key) { |
249 const char* p = kCharsToEscape[k].replacement; | 249 const char* p = kCharsToEscape[k].replacement; |
250 while (*p) | 250 while (*p) |
251 output->push_back(*p++); | 251 output->push_back(*p++); |
252 break; | 252 break; |
253 } | 253 } |
254 } | 254 } |
255 if (k == ARRAYSIZE_UNSAFE(kCharsToEscape)) | 255 if (k == ARRAYSIZE_UNSAFE(kCharsToEscape)) |
256 output->push_back(c); | 256 output->push_back(c); |
257 } | 257 } |
258 | 258 |
259 template <class str> | 259 template <class str> |
260 str EscapeForHTMLImpl(const str& input) { | 260 str EscapeForHTMLImpl(const str& input) { |
261 str result; | 261 str result; |
262 result.reserve(input.size()); // Optimize for no escaping. | 262 result.reserve(input.size()); // Optimize for no escaping. |
263 | 263 |
264 for (typename str::const_iterator i = input.begin(); i != input.end(); ++i) | 264 for (typename str::const_iterator i = input.begin(); i != input.end(); ++i) |
265 AppendEscapedCharForHTMLImpl(*i, &result); | 265 AppendEscapedCharForHTMLImpl(*i, &result); |
266 | 266 |
267 return result; | 267 return result; |
268 } | 268 } |
269 | 269 |
270 // Everything except alphanumerics and !'()*-._~ | 270 // Everything except alphanumerics and !'()*-._~ |
271 // See RFC 2396 for the list of reserved characters. | 271 // See RFC 2396 for the list of reserved characters. |
272 static const Charmap kQueryCharmap = {{ | 272 static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L, |
273 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L, | 273 0xb8000001L, 0xffffffffL, 0xffffffffL, |
274 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | 274 0xffffffffL, 0xffffffffL}}; |
275 }}; | |
276 | 275 |
277 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} | 276 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} |
278 static const Charmap kPathCharmap = {{ | 277 static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L, |
279 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L, | 278 0xb8000001L, 0xffffffffL, 0xffffffffL, |
280 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | 279 0xffffffffL, 0xffffffffL}}; |
281 }}; | |
282 | 280 |
283 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} | 281 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} |
284 static const Charmap kUrlEscape = {{ | 282 static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L, |
285 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L, | 283 0xb8000001L, 0xffffffffL, 0xffffffffL, |
286 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | 284 0xffffffffL, 0xffffffffL}}; |
287 }}; | |
288 | 285 |
289 // non-7bit | 286 // non-7bit |
290 static const Charmap kNonASCIICharmap = {{ | 287 static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L, |
291 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L, | 288 0x00000000L, 0xffffffffL, 0xffffffffL, |
292 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | 289 0xffffffffL, 0xffffffffL}}; |
293 }}; | |
294 | 290 |
295 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and | 291 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and |
296 // !'()*-._~% | 292 // !'()*-._~% |
297 static const Charmap kExternalHandlerCharmap = {{ | 293 static const Charmap kExternalHandlerCharmap = { |
298 0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L, | 294 {0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L, 0xffffffffL, |
299 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | 295 0xffffffffL, 0xffffffffL, 0xffffffffL}}; |
300 }}; | |
301 | 296 |
302 } // namespace | 297 } // namespace |
303 | 298 |
304 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) { | 299 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) { |
305 return Escape(text, kQueryCharmap, use_plus); | 300 return Escape(text, kQueryCharmap, use_plus); |
306 } | 301 } |
307 | 302 |
308 std::string EscapePath(const std::string& path) { | 303 std::string EscapePath(const std::string& path) { |
309 return Escape(path, kPathCharmap, false); | 304 return Escape(path, kPathCharmap, false); |
310 } | 305 } |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
347 UnescapeRule::Type rules) { | 342 UnescapeRule::Type rules) { |
348 return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL); | 343 return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL); |
349 } | 344 } |
350 | 345 |
351 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments( | 346 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments( |
352 const std::string& text, | 347 const std::string& text, |
353 UnescapeRule::Type rules, | 348 UnescapeRule::Type rules, |
354 base::OffsetAdjuster::Adjustments* adjustments) { | 349 base::OffsetAdjuster::Adjustments* adjustments) { |
355 base::string16 result; | 350 base::string16 result; |
356 base::OffsetAdjuster::Adjustments unescape_adjustments; | 351 base::OffsetAdjuster::Adjustments unescape_adjustments; |
357 std::string unescaped_url(UnescapeURLWithAdjustmentsImpl( | 352 std::string unescaped_url( |
358 text, rules, &unescape_adjustments)); | 353 UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments)); |
359 if (base::UTF8ToUTF16WithAdjustments(unescaped_url.data(), | 354 if (base::UTF8ToUTF16WithAdjustments( |
360 unescaped_url.length(), | 355 unescaped_url.data(), unescaped_url.length(), &result, adjustments)) { |
361 &result, adjustments)) { | |
362 // Character set looks like it's valid. | 356 // Character set looks like it's valid. |
363 if (adjustments) { | 357 if (adjustments) { |
364 base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments, | 358 base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments, |
365 adjustments); | 359 adjustments); |
366 } | 360 } |
367 return result; | 361 return result; |
368 } | 362 } |
369 // Character set is not valid. Return the escaped version. | 363 // Character set is not valid. Return the escaped version. |
370 return base::UTF8ToUTF16WithAdjustments(text, adjustments); | 364 return base::UTF8ToUTF16WithAdjustments(text, adjustments); |
371 } | 365 } |
372 | 366 |
373 base::string16 UnescapeForHTML(const base::string16& input) { | 367 base::string16 UnescapeForHTML(const base::string16& input) { |
374 static const struct { | 368 static const struct { |
375 const char* ampersand_code; | 369 const char* ampersand_code; |
376 const char replacement; | 370 const char replacement; |
377 } kEscapeToChars[] = { | 371 } kEscapeToChars[] = { |
378 { "&lt;", '<' }, | 372 {"&lt;", '<'}, |
379 { "&gt;", '>' }, | 373 {"&gt;", '>'}, |
380 { "&amp;", '&' }, | 374 {"&amp;", '&'}, |
381 { "&quot;", '"' }, | 375 {"&quot;", '"'}, |
382 { "&#39;", '\''}, | 376 {"&#39;", '\''}, |
383 }; | 377 }; |
384 | 378 |
385 if (input.find(base::ASCIIToUTF16("&")) == std::string::npos) | 379 if (input.find(base::ASCIIToUTF16("&")) == std::string::npos) |
386 return input; | 380 return input; |
387 | 381 |
388 base::string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)]; | 382 base::string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)]; |
389 base::string16 text(input); | 383 base::string16 text(input); |
390 for (base::string16::iterator iter = text.begin(); | 384 for (base::string16::iterator iter = text.begin(); iter != text.end(); |
391 iter != text.end(); ++iter) { | 385 ++iter) { |
392 if (*iter == '&') { | 386 if (*iter == '&') { |
393 // Potential ampersand encode char. | 387 // Potential ampersand encode char. |
394 size_t index = iter - text.begin(); | 388 size_t index = iter - text.begin(); |
395 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) { | 389 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) { |
396 if (ampersand_chars[i].empty()) { | 390 if (ampersand_chars[i].empty()) { |
397 ampersand_chars[i] = | 391 ampersand_chars[i] = |
398 base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code); | 392 base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code); |
399 } | 393 } |
400 if (text.find(ampersand_chars[i], index) == index) { | 394 if (text.find(ampersand_chars[i], index) == index) { |
401 text.replace(iter, iter + ampersand_chars[i].length(), | 395 text.replace(iter, |
402 1, kEscapeToChars[i].replacement); | 396 iter + ampersand_chars[i].length(), |
| 397 1, |
| 398 kEscapeToChars[i].replacement); |
403 break; | 399 break; |
404 } | 400 } |
405 } | 401 } |
406 } | 402 } |
407 } | 403 } |
408 return text; | 404 return text; |
409 } | 405 } |
410 | 406 |
411 } // namespace net | 407 } // namespace net |
OLD | NEW |