OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "net/base/escape.h" | |
6 | |
7 #include <algorithm> | |
8 | |
9 #include "base/logging.h" | |
10 #include "base/memory/scoped_ptr.h" | |
11 #include "base/strings/string_piece.h" | |
12 #include "base/strings/string_util.h" | |
13 #include "base/strings/utf_offset_string_conversions.h" | |
14 #include "base/strings/utf_string_conversions.h" | |
15 | |
16 namespace net { | |
17 | |
18 namespace { | |
19 | |
20 const char kHexString[] = "0123456789ABCDEF"; | |
21 inline char IntToHex(int i) { | |
22 DCHECK_GE(i, 0) << i << " not a hex value"; | |
23 DCHECK_LE(i, 15) << i << " not a hex value"; | |
24 return kHexString[i]; | |
25 } | |
26 | |
27 // A fast bit-vector map for ascii characters. | |
28 // | |
29 // Internally stores 256 bits in an array of 8 ints. | |
30 // Does quick bit-flicking to lookup needed characters. | |
31 struct Charmap { | |
32 bool Contains(unsigned char c) const { | |
33 return ((map[c >> 5] & (1 << (c & 31))) != 0); | |
34 } | |
35 | |
36 uint32 map[8]; | |
37 }; | |
38 | |
39 // Given text to escape and a Charmap defining which values to escape, | |
40 // return an escaped string. If use_plus is true, spaces are converted | |
41 // to +, otherwise, if spaces are in the charmap, they are converted to | |
42 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if | |
43 // '%' is in the charmap, it is converted to %25. | |
44 std::string Escape(const std::string& text, | |
45 const Charmap& charmap, | |
46 bool use_plus, | |
47 bool keep_escaped = false) { | |
48 std::string escaped; | |
49 escaped.reserve(text.length() * 3); | |
50 for (unsigned int i = 0; i < text.length(); ++i) { | |
51 unsigned char c = static_cast<unsigned char>(text[i]); | |
52 if (use_plus && ' ' == c) { | |
53 escaped.push_back('+'); | |
54 } else if (keep_escaped && '%' == c && i + 2 < text.length() && | |
55 IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) { | |
56 escaped.push_back('%'); | |
57 } else if (charmap.Contains(c)) { | |
58 escaped.push_back('%'); | |
59 escaped.push_back(IntToHex(c >> 4)); | |
60 escaped.push_back(IntToHex(c & 0xf)); | |
61 } else { | |
62 escaped.push_back(c); | |
63 } | |
64 } | |
65 return escaped; | |
66 } | |
67 | |
// Lookup table indexed by 7-bit character value. A nonzero entry means the
// escaped form of that character may be unescaped in a normal URL. Characters
// with a zero entry are ones that may change the parsing of a URL, so we don't
// always want to unescape them. In many cases we won't want to unescape spaces
// either, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since those could be part of
// a query and unescaping them could change the server's parsing of the query.
// Nor can we unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. Unescaping such a character would change the URL,
// and you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The characters in this category are the ones labeled
// PASS (allow either escaped or unescaped) in the big lookup table at the top
// of url/url_canon_path.cc. Also, characters that have CHAR_QUERY set in
// url/url_canon_internal.cc but are not allowed in query strings according to
// http://www.ietf.org/rfc/rfc3261.txt are not unescaped, to avoid turning a
// URL that is valid according to the spec into an invalid one.
const char kUrlUnescape[128] = {
  // 0x00 - 0x1F: NUL and the other control characters.
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  // 0x20 - 0x2F:
  // ' ' !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
  // 0x30 - 0x3F:
  // 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  1,  0,  1,  0,
  // 0x40 - 0x4F:
  // @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
  // 0x50 - 0x5F:
  // P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  1,
  // 0x60 - 0x6F:
  // `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
  // 0x70 - 0x7F:
  // p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~   <DEL>
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  0
};
105 | |
// Tries to decode the %XX sequence that starts at |index| in |escaped_text|.
// On success stores the decoded byte in |*value| and returns true. Returns
// false, leaving |*value| untouched, when fewer than three characters remain,
// the character at |index| is not '%', or either following character is not a
// hex digit.
template<typename STR>
bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
                                 size_t index,
                                 unsigned char* value) {
  // The bounds test comes first so the indexing below stays in range.
  if ((index + 2) >= escaped_text.size() || escaped_text[index] != '%')
    return false;

  typedef typename STR::value_type Char;
  const Char high_digit = static_cast<Char>(escaped_text[index + 1]);
  const Char low_digit = static_cast<Char>(escaped_text[index + 2]);
  if (!IsHexDigit(high_digit) || !IsHexDigit(low_digit))
    return false;

  *value = HexDigitToInt(high_digit) * 16 + HexDigitToInt(low_digit);
  return true;
}
128 | |
// Returns true if the escaped two-byte sequence %D8%9C (U+061C) starts at
// |index|. |first_byte| is the already-decoded byte at |index|; the second
// byte is decoded from the escape sequence three characters further on.
template<typename STR>
bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  unsigned char next_byte;
  return first_byte == 0xD8 &&
         UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &next_byte) &&
         next_byte == 0x9c;
}
142 | |
// Returns true if an escaped three-byte BiDi control character starts at
// |index|. |first_byte| is the already-decoded byte at |index|. The accepted
// sequences are %E2%80 followed by %8E, %8F or %AA..%AE, and %E2%81 followed
// by %A6..%A9.
template<typename STR>
bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
                                        unsigned char first_byte,
                                        size_t index) {
  if (first_byte != 0xE2)
    return false;

  unsigned char middle_byte;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &middle_byte))
    return false;
  const bool middle_is_80 = (middle_byte == 0x80);
  if (!middle_is_80 && middle_byte != 0x81)
    return false;

  unsigned char last_byte;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &last_byte))
    return false;

  if (!middle_is_80) {
    // %E2%81%A6 .. %E2%81%A9
    return last_byte >= 0xA6 && last_byte <= 0xA9;
  }
  // %E2%80%8E, %E2%80%8F and %E2%80%AA .. %E2%80%AE
  return last_byte == 0x8E ||
         last_byte == 0x8F ||
         (last_byte >= 0xAA && last_byte <= 0xAE);
}
166 | |
167 // Unescapes |escaped_text| according to |rules|, returning the resulting | |
168 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects | |
169 // the alterations done to the string that are not one-character-to-one- | |
170 // character. The resulting |adjustments| will always be sorted by increasing | |
171 // offset. | |
172 template<typename STR> | |
173 STR UnescapeURLWithAdjustmentsImpl( | |
174 const STR& escaped_text, | |
175 UnescapeRule::Type rules, | |
176 base::OffsetAdjuster::Adjustments* adjustments) { | |
177 if (adjustments) | |
178 adjustments->clear(); | |
179 // Do not unescape anything, return the |escaped_text| text. | |
180 if (rules == UnescapeRule::NONE) | |
181 return escaped_text; | |
182 | |
183 // The output of the unescaping is always smaller than the input, so we can | |
184 // reserve the input size to make sure we have enough buffer and don't have | |
185 // to allocate in the loop below. | |
186 STR result; | |
187 result.reserve(escaped_text.length()); | |
188 | |
189 // Locations of adjusted text. | |
190 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { | |
191 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { | |
192 // Non ASCII character, append as is. | |
193 result.push_back(escaped_text[i]); | |
194 continue; | |
195 } | |
196 | |
197 unsigned char first_byte; | |
198 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) { | |
199 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi | |
200 // control characters are not allowed to appear unescaped in URLs: | |
201 // | |
202 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E) | |
203 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F) | |
204 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA) | |
205 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB) | |
206 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC) | |
207 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD) | |
208 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE) | |
209 // | |
210 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC | |
211 // 3987 above has since added some new BiDi control characters. | |
212 // http://www.unicode.org/reports/tr9 | |
213 // | |
214 // U+061C ARABIC LETTER MARK (%D8%9C) | |
215 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) | |
216 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) | |
217 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) | |
218 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) | |
219 // | |
220 // However, some schemes such as data: and file: need to parse the exact | |
221 // binary data when loading the URL. For that reason, CONTROL_CHARS allows | |
222 // unescaping BiDi control characters. | |
223 // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed | |
224 // in the UI. | |
225 if (!(rules & UnescapeRule::CONTROL_CHARS)) { | |
226 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) { | |
227 // Keep Arabic Language Mark escaped. | |
228 result.append(escaped_text, i, 6); | |
229 i += 5; | |
230 continue; | |
231 } | |
232 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) { | |
233 // Keep BiDi control char escaped. | |
234 result.append(escaped_text, i, 9); | |
235 i += 8; | |
236 continue; | |
237 } | |
238 } | |
239 | |
240 if (first_byte >= 0x80 || // Unescape all high-bit characters. | |
241 // For 7-bit characters, the lookup table tells us all valid chars. | |
242 (kUrlUnescape[first_byte] || | |
243 // ...and we allow some additional unescaping when flags are set. | |
244 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) || | |
245 // Allow any of the prohibited but non-control characters when | |
246 // we're doing "special" chars. | |
247 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || | |
248 // Additionally allow control characters if requested. | |
249 (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { | |
250 // Use the unescaped version of the character. | |
251 if (adjustments) | |
252 adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1)); | |
253 result.push_back(first_byte); | |
254 i += 2; | |
255 } else { | |
256 // Keep escaped. Append a percent and we'll get the following two | |
257 // digits on the next loops through. | |
258 result.push_back('%'); | |
259 } | |
260 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && | |
261 escaped_text[i] == '+') { | |
262 result.push_back(' '); | |
263 } else { | |
264 // Normal case for unescaped characters. | |
265 result.push_back(escaped_text[i]); | |
266 } | |
267 } | |
268 | |
269 return result; | |
270 } | |
271 | |
// Appends |c| to |output|, replacing the five HTML-significant characters
// with their entity form: '<' -> "&lt;", '>' -> "&gt;", '&' -> "&amp;",
// '"' -> "&quot;" and '\'' -> "&#39;". Every other character is appended
// unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  const char* replacement = NULL;
  switch (c) {
    case '<':
      replacement = "&lt;";
      break;
    case '>':
      replacement = "&gt;";
      break;
    case '&':
      replacement = "&amp;";
      break;
    case '"':
      replacement = "&quot;";
      break;
    case '\'':
      replacement = "&#39;";
      break;
    default:
      break;
  }

  if (!replacement) {
    output->push_back(c);
    return;
  }
  // Append one character at a time so the narrow entity text also works when
  // |str| uses a wider character type.
  for (const char* p = replacement; *p; ++p)
    output->push_back(*p);
}
296 | |
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl().
template <class str>
str EscapeForHTMLImpl(const str& input) {
  const typename str::size_type length = input.size();
  str escaped;
  escaped.reserve(length);  // Sized for the common case of no escaping.

  for (typename str::size_type pos = 0; pos < length; ++pos)
    AppendEscapedCharForHTMLImpl(input[pos], &escaped);

  return escaped;
}
307 | |
308 // Everything except alphanumerics and !'()*-._~ | |
309 // See RFC 2396 for the list of reserved characters. | |
310 static const Charmap kQueryCharmap = {{ | |
311 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L, | |
312 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
313 }}; | |
314 | |
315 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} | |
316 static const Charmap kPathCharmap = {{ | |
317 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L, | |
318 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
319 }}; | |
320 | |
321 #if defined(OS_MACOSX) | |
322 // non-printable, non-7bit, and (including space) "#%<>[\]^`{|} | |
323 static const Charmap kNSURLCharmap = {{ | |
324 0xffffffffL, 0x5000002dL, 0x78000000L, 0xb8000001L, | |
325 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
326 }}; | |
327 #endif // defined(OS_MACOSX) | |
328 | |
329 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} | |
330 static const Charmap kUrlEscape = {{ | |
331 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L, | |
332 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
333 }}; | |
334 | |
335 // non-7bit | |
336 static const Charmap kNonASCIICharmap = {{ | |
337 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L, | |
338 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
339 }}; | |
340 | |
341 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and | |
342 // !'()*-._~#[] | |
343 static const Charmap kExternalHandlerCharmap = {{ | |
344 0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, | |
345 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
346 }}; | |
347 | |
348 } // namespace | |
349 | |
350 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) { | |
351 return Escape(text, kQueryCharmap, use_plus); | |
352 } | |
353 | |
354 std::string EscapePath(const std::string& path) { | |
355 return Escape(path, kPathCharmap, false); | |
356 } | |
357 | |
#if defined(OS_MACOSX)
// Escapes |precursor| with kNSURLCharmap. Spaces are never turned into '+',
// and a '%' that already starts a %XX sequence is left alone.
std::string EscapeNSURLPrecursor(const std::string& precursor) {
  const bool use_plus = false;
  const bool keep_escaped = true;
  return Escape(precursor, kNSURLCharmap, use_plus, keep_escaped);
}
#endif  // defined(OS_MACOSX)
363 | |
364 std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) { | |
365 return Escape(path, kUrlEscape, use_plus); | |
366 } | |
367 | |
368 std::string EscapeNonASCII(const std::string& input) { | |
369 return Escape(input, kNonASCIICharmap, false); | |
370 } | |
371 | |
372 std::string EscapeExternalHandlerValue(const std::string& text) { | |
373 return Escape(text, kExternalHandlerCharmap, false, true); | |
374 } | |
375 | |
376 void AppendEscapedCharForHTML(char c, std::string* output) { | |
377 AppendEscapedCharForHTMLImpl(c, output); | |
378 } | |
379 | |
380 std::string EscapeForHTML(const std::string& input) { | |
381 return EscapeForHTMLImpl(input); | |
382 } | |
383 | |
384 base::string16 EscapeForHTML(const base::string16& input) { | |
385 return EscapeForHTMLImpl(input); | |
386 } | |
387 | |
388 std::string UnescapeURLComponent(const std::string& escaped_text, | |
389 UnescapeRule::Type rules) { | |
390 return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL); | |
391 } | |
392 | |
393 base::string16 UnescapeURLComponent(const base::string16& escaped_text, | |
394 UnescapeRule::Type rules) { | |
395 return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL); | |
396 } | |
397 | |
398 base::string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, | |
399 UnescapeRule::Type rules) { | |
400 return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL); | |
401 } | |
402 | |
403 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments( | |
404 const std::string& text, | |
405 UnescapeRule::Type rules, | |
406 base::OffsetAdjuster::Adjustments* adjustments) { | |
407 base::string16 result; | |
408 base::OffsetAdjuster::Adjustments unescape_adjustments; | |
409 std::string unescaped_url(UnescapeURLWithAdjustmentsImpl( | |
410 text, rules, &unescape_adjustments)); | |
411 if (base::UTF8ToUTF16WithAdjustments(unescaped_url.data(), | |
412 unescaped_url.length(), | |
413 &result, adjustments)) { | |
414 // Character set looks like it's valid. | |
415 if (adjustments) { | |
416 base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments, | |
417 adjustments); | |
418 } | |
419 return result; | |
420 } | |
421 // Character set is not valid. Return the escaped version. | |
422 return base::UTF8ToUTF16WithAdjustments(text, adjustments); | |
423 } | |
424 | |
425 base::string16 UnescapeForHTML(const base::string16& input) { | |
426 static const struct { | |
427 const char* ampersand_code; | |
428 const char replacement; | |
429 } kEscapeToChars[] = { | |
430 { "<", '<' }, | |
431 { ">", '>' }, | |
432 { "&", '&' }, | |
433 { """, '"' }, | |
434 { "'", '\''}, | |
435 }; | |
436 | |
437 if (input.find(base::ASCIIToUTF16("&")) == std::string::npos) | |
438 return input; | |
439 | |
440 base::string16 ampersand_chars[arraysize(kEscapeToChars)]; | |
441 base::string16 text(input); | |
442 for (base::string16::iterator iter = text.begin(); | |
443 iter != text.end(); ++iter) { | |
444 if (*iter == '&') { | |
445 // Potential ampersand encode char. | |
446 size_t index = iter - text.begin(); | |
447 for (size_t i = 0; i < arraysize(kEscapeToChars); i++) { | |
448 if (ampersand_chars[i].empty()) { | |
449 ampersand_chars[i] = | |
450 base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code); | |
451 } | |
452 if (text.find(ampersand_chars[i], index) == index) { | |
453 text.replace(iter, iter + ampersand_chars[i].length(), | |
454 1, kEscapeToChars[i].replacement); | |
455 break; | |
456 } | |
457 } | |
458 } | |
459 } | |
460 return text; | |
461 } | |
462 | |
463 } // namespace net | |
OLD | NEW |