| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "net/base/escape.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 | |
| 9 #include "base/logging.h" | |
| 10 #include "base/memory/scoped_ptr.h" | |
| 11 #include "base/strings/string_piece.h" | |
| 12 #include "base/strings/string_util.h" | |
| 13 #include "base/strings/utf_offset_string_conversions.h" | |
| 14 #include "base/strings/utf_string_conversions.h" | |
| 15 | |
| 16 namespace net { | |
| 17 | |
| 18 namespace { | |
| 19 | |
| 20 const char kHexString[] = "0123456789ABCDEF"; | |
| 21 inline char IntToHex(int i) { | |
| 22 DCHECK_GE(i, 0) << i << " not a hex value"; | |
| 23 DCHECK_LE(i, 15) << i << " not a hex value"; | |
| 24 return kHexString[i]; | |
| 25 } | |
| 26 | |
| 27 // A fast bit-vector map for ascii characters. | |
| 28 // | |
| 29 // Internally stores 256 bits in an array of 8 ints. | |
| 30 // Does quick bit-flicking to lookup needed characters. | |
| 31 struct Charmap { | |
| 32 bool Contains(unsigned char c) const { | |
| 33 return ((map[c >> 5] & (1 << (c & 31))) != 0); | |
| 34 } | |
| 35 | |
| 36 uint32 map[8]; | |
| 37 }; | |
| 38 | |
| 39 // Given text to escape and a Charmap defining which values to escape, | |
| 40 // return an escaped string. If use_plus is true, spaces are converted | |
| 41 // to +, otherwise, if spaces are in the charmap, they are converted to | |
| 42 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if | |
| 43 // '%' is in the charmap, it is converted to %25. | |
| 44 std::string Escape(const std::string& text, | |
| 45 const Charmap& charmap, | |
| 46 bool use_plus, | |
| 47 bool keep_escaped = false) { | |
| 48 std::string escaped; | |
| 49 escaped.reserve(text.length() * 3); | |
| 50 for (unsigned int i = 0; i < text.length(); ++i) { | |
| 51 unsigned char c = static_cast<unsigned char>(text[i]); | |
| 52 if (use_plus && ' ' == c) { | |
| 53 escaped.push_back('+'); | |
| 54 } else if (keep_escaped && '%' == c && i + 2 < text.length() && | |
| 55 IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) { | |
| 56 escaped.push_back('%'); | |
| 57 } else if (charmap.Contains(c)) { | |
| 58 escaped.push_back('%'); | |
| 59 escaped.push_back(IntToHex(c >> 4)); | |
| 60 escaped.push_back(IntToHex(c & 0xf)); | |
| 61 } else { | |
| 62 escaped.push_back(c); | |
| 63 } | |
| 64 } | |
| 65 return escaped; | |
| 66 } | |
| 67 | |
// Lookup table indexed by 7-bit ASCII value. An entry is nonzero when the
// corresponding character is unescapable for normal URLs. The characters
// with a zero entry are the ones that may change the parsing of a URL, so we
// don't always want to unescape them. In many cases we won't want to unescape
// spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change
// parsing, like # or ?. We also can't unescape &, =, or + since those could
// be part of a query and unescaping could change the server's parsing of the
// query. Nor can we unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. Unescaping such a character would change the URL,
// and you could get different behavior if you copy and paste the URL, or
// press enter in the URL bar. The characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc. Also, characters that
// have CHAR_QUERY set in url/url_canon_internal.cc but are not allowed in
// query strings according to http://www.ietf.org/rfc/rfc3261.txt are not
// unescaped, to avoid turning a URL that is valid according to the spec into
// an invalid one.
const char kUrlUnescape[128] = {
  // 0x00 - 0x1F: NULL and the other control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2F: ' ' ! " # $ % & ' ( ) * + , - . /
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  // 0x30 - 0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  // 0x40 - 0x4F: @ A B C D E F G H I J K L M N O
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x50 - 0x5F: P Q R S T U V W X Y Z [ \ ] ^ _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
  // 0x60 - 0x6F: ` a b c d e f g h i j k l m n o
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x70 - 0x7F: p q r s t u v w x y z { | } ~ <DEL>
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
};
| 105 | |
// Tries to decode a %XX sequence that starts at |index| in |escaped_text|.
// On success stores the decoded byte in |*value| and returns true. Returns
// false, leaving |*value| untouched, when fewer than three characters remain
// at |index|, when the character at |index| is not '%', or when either of
// the next two characters is not a hex digit.
template<typename STR>
bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
                                 size_t index,
                                 unsigned char* value) {
  typedef typename STR::value_type CharType;

  const bool has_three_chars = (index + 2) < escaped_text.size();
  if (!has_three_chars || escaped_text[index] != '%')
    return false;

  const CharType high_digit = static_cast<CharType>(escaped_text[index + 1]);
  const CharType low_digit = static_cast<CharType>(escaped_text[index + 2]);
  if (!IsHexDigit(high_digit) || !IsHexDigit(low_digit))
    return false;

  *value = HexDigitToInt(high_digit) * 16 + HexDigitToInt(low_digit);
  return true;
}
| 128 | |
// Returns true if the escaped two-byte sequence %D8%9C (U+061C) starts at
// |index|. |first_byte| is the already-decoded byte at |index|; the second
// byte is decoded from the escape sequence at |index + 3|.
template<typename STR>
bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  unsigned char trailing_byte;
  return first_byte == 0xD8 &&
         UnescapeUnsignedCharAtIndex(escaped_text, index + 3,
                                     &trailing_byte) &&
         trailing_byte == 0x9c;
}
| 142 | |
// Returns true if an escaped three-byte BiDi control character starts at
// |index|. |first_byte| is the already-decoded byte at |index|. The accepted
// sequences are:
//   %E2%80 followed by %8E, %8F, or %AA through %AE
//   %E2%81 followed by %A6 through %A9
template<typename STR>
bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
                                        unsigned char first_byte,
                                        size_t index) {
  if (first_byte != 0xE2)
    return false;

  unsigned char middle;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &middle))
    return false;
  const bool is_e2_80 = (middle == 0x80);
  if (!is_e2_80 && middle != 0x81)
    return false;

  unsigned char last;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &last))
    return false;

  if (!is_e2_80)
    return last >= 0xA6 && last <= 0xA9;
  return last == 0x8E || last == 0x8F || (last >= 0xAA && last <= 0xAE);
}
| 166 | |
| 167 // Unescapes |escaped_text| according to |rules|, returning the resulting | |
| 168 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects | |
| 169 // the alterations done to the string that are not one-character-to-one- | |
| 170 // character. The resulting |adjustments| will always be sorted by increasing | |
| 171 // offset. | |
| 172 template<typename STR> | |
| 173 STR UnescapeURLWithAdjustmentsImpl( | |
| 174 const STR& escaped_text, | |
| 175 UnescapeRule::Type rules, | |
| 176 base::OffsetAdjuster::Adjustments* adjustments) { | |
| 177 if (adjustments) | |
| 178 adjustments->clear(); | |
| 179 // Do not unescape anything, return the |escaped_text| text. | |
| 180 if (rules == UnescapeRule::NONE) | |
| 181 return escaped_text; | |
| 182 | |
| 183 // The output of the unescaping is always smaller than the input, so we can | |
| 184 // reserve the input size to make sure we have enough buffer and don't have | |
| 185 // to allocate in the loop below. | |
| 186 STR result; | |
| 187 result.reserve(escaped_text.length()); | |
| 188 | |
| 189 // Locations of adjusted text. | |
| 190 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { | |
| 191 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { | |
| 192 // Non ASCII character, append as is. | |
| 193 result.push_back(escaped_text[i]); | |
| 194 continue; | |
| 195 } | |
| 196 | |
| 197 unsigned char first_byte; | |
| 198 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) { | |
| 199 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi | |
| 200 // control characters are not allowed to appear unescaped in URLs: | |
| 201 // | |
| 202 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E) | |
| 203 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F) | |
| 204 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA) | |
| 205 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB) | |
| 206 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC) | |
| 207 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD) | |
| 208 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE) | |
| 209 // | |
| 210 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC | |
| 211 // 3987 above has since added some new BiDi control characters. | |
| 212 // http://www.unicode.org/reports/tr9 | |
| 213 // | |
| 214 // U+061C ARABIC LETTER MARK (%D8%9C) | |
| 215 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) | |
| 216 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) | |
| 217 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) | |
| 218 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) | |
| 219 // | |
| 220 // However, some schemes such as data: and file: need to parse the exact | |
| 221 // binary data when loading the URL. For that reason, CONTROL_CHARS allows | |
| 222 // unescaping BiDi control characters. | |
| 223 // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed | |
| 224 // in the UI. | |
| 225 if (!(rules & UnescapeRule::CONTROL_CHARS)) { | |
| 226 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) { | |
| 227 // Keep Arabic Language Mark escaped. | |
| 228 result.append(escaped_text, i, 6); | |
| 229 i += 5; | |
| 230 continue; | |
| 231 } | |
| 232 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) { | |
| 233 // Keep BiDi control char escaped. | |
| 234 result.append(escaped_text, i, 9); | |
| 235 i += 8; | |
| 236 continue; | |
| 237 } | |
| 238 } | |
| 239 | |
| 240 if (first_byte >= 0x80 || // Unescape all high-bit characters. | |
| 241 // For 7-bit characters, the lookup table tells us all valid chars. | |
| 242 (kUrlUnescape[first_byte] || | |
| 243 // ...and we allow some additional unescaping when flags are set. | |
| 244 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) || | |
| 245 // Allow any of the prohibited but non-control characters when | |
| 246 // we're doing "special" chars. | |
| 247 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || | |
| 248 // Additionally allow control characters if requested. | |
| 249 (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { | |
| 250 // Use the unescaped version of the character. | |
| 251 if (adjustments) | |
| 252 adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1)); | |
| 253 result.push_back(first_byte); | |
| 254 i += 2; | |
| 255 } else { | |
| 256 // Keep escaped. Append a percent and we'll get the following two | |
| 257 // digits on the next loops through. | |
| 258 result.push_back('%'); | |
| 259 } | |
| 260 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && | |
| 261 escaped_text[i] == '+') { | |
| 262 result.push_back(' '); | |
| 263 } else { | |
| 264 // Normal case for unescaped characters. | |
| 265 result.push_back(escaped_text[i]); | |
| 266 } | |
| 267 } | |
| 268 | |
| 269 return result; | |
| 270 } | |
| 271 | |
// Appends |c| to |*output|, replacing the HTML-significant characters
// < > & " ' with the entities &lt; &gt; &amp; &quot; &#39; respectively.
// Every other character is appended unchanged. |str| may be any string type
// whose value_type can be constructed from a char.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  const char* replacement = NULL;
  switch (c) {
    case '<':
      replacement = "&lt;";
      break;
    case '>':
      replacement = "&gt;";
      break;
    case '&':
      replacement = "&amp;";
      break;
    case '"':
      replacement = "&quot;";
      break;
    case '\'':
      replacement = "&#39;";
      break;
    default:
      break;
  }

  if (!replacement) {
    output->push_back(c);
    return;
  }
  // Push one character at a time so the ASCII entity widens correctly when
  // |str| is a wide string type.
  for (const char* p = replacement; *p; ++p)
    output->push_back(*p);
}
| 296 | |
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl().
template <class str>
str EscapeForHTMLImpl(const str& input) {
  str escaped;
  // Sized for the case where nothing needs escaping.
  escaped.reserve(input.size());

  typename str::const_iterator it = input.begin();
  while (it != input.end()) {
    AppendEscapedCharForHTMLImpl(*it, &escaped);
    ++it;
  }

  return escaped;
}
| 307 | |
| 308 // Everything except alphanumerics and !'()*-._~ | |
| 309 // See RFC 2396 for the list of reserved characters. | |
| 310 static const Charmap kQueryCharmap = {{ | |
| 311 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L, | |
| 312 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
| 313 }}; | |
| 314 | |
| 315 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} | |
| 316 static const Charmap kPathCharmap = {{ | |
| 317 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L, | |
| 318 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
| 319 }}; | |
| 320 | |
#if defined(OS_MACOSX)
// Escapes non-printable and non-7bit characters, plus (including space)
// "#%<>[\]^`{|}
static const Charmap kNSURLCharmap = {{
  0xffffffffL,  // 0x00 - 0x1F
  0x5000002dL,  // 0x20 - 0x3F
  0x78000000L,  // 0x40 - 0x5F
  0xb8000001L,  // 0x60 - 0x7F
  0xffffffffL,  // 0x80 - 0x9F
  0xffffffffL,  // 0xA0 - 0xBF
  0xffffffffL,  // 0xC0 - 0xDF
  0xffffffffL   // 0xE0 - 0xFF
}};
#endif  // defined(OS_MACOSX)
| 328 | |
| 329 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} | |
| 330 static const Charmap kUrlEscape = {{ | |
| 331 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L, | |
| 332 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
| 333 }}; | |
| 334 | |
| 335 // non-7bit | |
| 336 static const Charmap kNonASCIICharmap = {{ | |
| 337 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L, | |
| 338 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
| 339 }}; | |
| 340 | |
| 341 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and | |
| 342 // !'()*-._~#[] | |
| 343 static const Charmap kExternalHandlerCharmap = {{ | |
| 344 0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, | |
| 345 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL | |
| 346 }}; | |
| 347 | |
| 348 } // namespace | |
| 349 | |
| 350 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) { | |
| 351 return Escape(text, kQueryCharmap, use_plus); | |
| 352 } | |
| 353 | |
| 354 std::string EscapePath(const std::string& path) { | |
| 355 return Escape(path, kPathCharmap, false); | |
| 356 } | |
| 357 | |
#if defined(OS_MACOSX)
// Escapes |precursor| with kNSURLCharmap. Spaces are never converted to '+',
// and a '%' that already starts a %XX sequence is left as is.
std::string EscapeNSURLPrecursor(const std::string& precursor) {
  const bool use_plus = false;
  const bool keep_escaped = true;
  return Escape(precursor, kNSURLCharmap, use_plus, keep_escaped);
}
#endif  // defined(OS_MACOSX)
| 363 | |
| 364 std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) { | |
| 365 return Escape(path, kUrlEscape, use_plus); | |
| 366 } | |
| 367 | |
| 368 std::string EscapeNonASCII(const std::string& input) { | |
| 369 return Escape(input, kNonASCIICharmap, false); | |
| 370 } | |
| 371 | |
| 372 std::string EscapeExternalHandlerValue(const std::string& text) { | |
| 373 return Escape(text, kExternalHandlerCharmap, false, true); | |
| 374 } | |
| 375 | |
| 376 void AppendEscapedCharForHTML(char c, std::string* output) { | |
| 377 AppendEscapedCharForHTMLImpl(c, output); | |
| 378 } | |
| 379 | |
| 380 std::string EscapeForHTML(const std::string& input) { | |
| 381 return EscapeForHTMLImpl(input); | |
| 382 } | |
| 383 | |
| 384 base::string16 EscapeForHTML(const base::string16& input) { | |
| 385 return EscapeForHTMLImpl(input); | |
| 386 } | |
| 387 | |
| 388 std::string UnescapeURLComponent(const std::string& escaped_text, | |
| 389 UnescapeRule::Type rules) { | |
| 390 return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL); | |
| 391 } | |
| 392 | |
| 393 base::string16 UnescapeURLComponent(const base::string16& escaped_text, | |
| 394 UnescapeRule::Type rules) { | |
| 395 return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL); | |
| 396 } | |
| 397 | |
| 398 base::string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, | |
| 399 UnescapeRule::Type rules) { | |
| 400 return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL); | |
| 401 } | |
| 402 | |
| 403 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments( | |
| 404 const std::string& text, | |
| 405 UnescapeRule::Type rules, | |
| 406 base::OffsetAdjuster::Adjustments* adjustments) { | |
| 407 base::string16 result; | |
| 408 base::OffsetAdjuster::Adjustments unescape_adjustments; | |
| 409 std::string unescaped_url(UnescapeURLWithAdjustmentsImpl( | |
| 410 text, rules, &unescape_adjustments)); | |
| 411 if (base::UTF8ToUTF16WithAdjustments(unescaped_url.data(), | |
| 412 unescaped_url.length(), | |
| 413 &result, adjustments)) { | |
| 414 // Character set looks like it's valid. | |
| 415 if (adjustments) { | |
| 416 base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments, | |
| 417 adjustments); | |
| 418 } | |
| 419 return result; | |
| 420 } | |
| 421 // Character set is not valid. Return the escaped version. | |
| 422 return base::UTF8ToUTF16WithAdjustments(text, adjustments); | |
| 423 } | |
| 424 | |
| 425 base::string16 UnescapeForHTML(const base::string16& input) { | |
| 426 static const struct { | |
| 427 const char* ampersand_code; | |
| 428 const char replacement; | |
| 429 } kEscapeToChars[] = { | |
| 430 { "<", '<' }, | |
| 431 { ">", '>' }, | |
| 432 { "&", '&' }, | |
| 433 { """, '"' }, | |
| 434 { "'", '\''}, | |
| 435 }; | |
| 436 | |
| 437 if (input.find(base::ASCIIToUTF16("&")) == std::string::npos) | |
| 438 return input; | |
| 439 | |
| 440 base::string16 ampersand_chars[arraysize(kEscapeToChars)]; | |
| 441 base::string16 text(input); | |
| 442 for (base::string16::iterator iter = text.begin(); | |
| 443 iter != text.end(); ++iter) { | |
| 444 if (*iter == '&') { | |
| 445 // Potential ampersand encode char. | |
| 446 size_t index = iter - text.begin(); | |
| 447 for (size_t i = 0; i < arraysize(kEscapeToChars); i++) { | |
| 448 if (ampersand_chars[i].empty()) { | |
| 449 ampersand_chars[i] = | |
| 450 base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code); | |
| 451 } | |
| 452 if (text.find(ampersand_chars[i], index) == index) { | |
| 453 text.replace(iter, iter + ampersand_chars[i].length(), | |
| 454 1, kEscapeToChars[i].replacement); | |
| 455 break; | |
| 456 } | |
| 457 } | |
| 458 } | |
| 459 } | |
| 460 return text; | |
| 461 } | |
| 462 | |
| 463 } // namespace net | |
| OLD | NEW |