| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "net/base/escape.h" | 5 #include "net/base/escape.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 | 8 |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| 10 #include "base/memory/scoped_ptr.h" | 10 #include "base/memory/scoped_ptr.h" |
| (...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 90 // @ A B C D E F G H I J K L M N O | 90 // @ A B C D E F G H I J K L M N O |
| 91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 92 // P Q R S T U V W X Y Z [ \ ] ^ _ | 92 // P Q R S T U V W X Y Z [ \ ] ^ _ |
| 93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, | 93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, |
| 94 // ` a b c d e f g h i j k l m n o | 94 // ` a b c d e f g h i j k l m n o |
| 95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 96 // p q r s t u v w x y z { | } ~ <NBSP> | 96 // p q r s t u v w x y z { | } ~ <NBSP> |
| 97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 | 97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 |
| 98 }; | 98 }; |
| 99 | 99 |
// Attempts to unescape the three-character "%XX" sequence that starts at
// |index| within |escaped_text|.
//
// On success, stores the decoded byte in |*value| and returns true. Returns
// false, leaving |*value| untouched, when fewer than three characters remain
// at |index|, when the character at |index| is not '%', or when either of the
// two characters following it is not a hexadecimal digit.
template<typename STR>
bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
                                 size_t index,
                                 unsigned char* value) {
  // A complete escape needs the '%' plus two digits. Comparing against the
  // remaining length (instead of computing |index + 2|) keeps this check
  // correct even when |index| is close to the largest size_t value, where the
  // addition would wrap around and let an out-of-range index through.
  const size_t size = escaped_text.size();
  if (index >= size || size - index < 3)
    return false;
  if (escaped_text[index] != '%')
    return false;

  const typename STR::value_type most_sig_digit(
      static_cast<typename STR::value_type>(escaped_text[index + 1]));
  const typename STR::value_type least_sig_digit(
      static_cast<typename STR::value_type>(escaped_text[index + 2]));
  if (!IsHexDigit(most_sig_digit) || !IsHexDigit(least_sig_digit))
    return false;

  // Combine the two hex digits into a single byte (high nibble first).
  *value = HexDigitToInt(most_sig_digit) * 16 +
           HexDigitToInt(least_sig_digit);
  return true;
}
| 122 |
| 100 template<typename STR> | 123 template<typename STR> |
| 101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, | 124 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, |
| 102 UnescapeRule::Type rules, | 125 UnescapeRule::Type rules, |
| 103 std::vector<size_t>* offsets_for_adjustment) { | 126 std::vector<size_t>* offsets_for_adjustment) { |
| 104 if (offsets_for_adjustment) { | 127 if (offsets_for_adjustment) { |
| 105 std::for_each(offsets_for_adjustment->begin(), | 128 std::for_each(offsets_for_adjustment->begin(), |
| 106 offsets_for_adjustment->end(), | 129 offsets_for_adjustment->end(), |
| 107 base::LimitOffset<STR>(escaped_text.length())); | 130 base::LimitOffset<STR>(escaped_text.length())); |
| 108 } | 131 } |
| 109 // Do not unescape anything, return the |escaped_text| text. | 132 // Do not unescape anything, return the |escaped_text| text. |
| 110 if (rules == UnescapeRule::NONE) | 133 if (rules == UnescapeRule::NONE) |
| 111 return escaped_text; | 134 return escaped_text; |
| 112 | 135 |
| 113 // The output of the unescaping is always smaller than the input, so we can | 136 // The output of the unescaping is always smaller than the input, so we can |
| 114 // reserve the input size to make sure we have enough buffer and don't have | 137 // reserve the input size to make sure we have enough buffer and don't have |
| 115 // to allocate in the loop below. | 138 // to allocate in the loop below. |
| 116 STR result; | 139 STR result; |
| 117 result.reserve(escaped_text.length()); | 140 result.reserve(escaped_text.length()); |
| 118 | 141 |
| 119 // Locations of adjusted text. | 142 // Locations of adjusted text. |
| 120 net::internal::AdjustEncodingOffset::Adjustments adjustments; | 143 net::internal::AdjustEncodingOffset::Adjustments adjustments; |
| 121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { | 144 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { |
| 122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { | 145 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { |
| 123 // Non ASCII character, append as is. | 146 // Non ASCII character, append as is. |
| 124 result.push_back(escaped_text[i]); | 147 result.push_back(escaped_text[i]); |
| 125 continue; | 148 continue; |
| 126 } | 149 } |
| 127 | 150 |
| 128 char current_char = static_cast<char>(escaped_text[i]); | 151 unsigned char first_byte; |
| 129 if (current_char == '%' && i + 2 < max) { | 152 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) { |
| 130 const typename STR::value_type most_sig_digit( | 153 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi |
| 131 static_cast<typename STR::value_type>(escaped_text[i + 1])); | 154 // control characters are not allowed to appear unescaped in URLs: |
| 132 const typename STR::value_type least_sig_digit( | 155 // |
| 133 static_cast<typename STR::value_type>(escaped_text[i + 2])); | 156 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E) |
| 134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { | 157 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F) |
| 135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 + | 158 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA) |
| 136 HexDigitToInt(least_sig_digit); | 159 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB) |
| 137 if (value >= 0x80 || // Unescape all high-bit characters. | 160 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC) |
| 138 // For 7-bit characters, the lookup table tells us all valid chars. | 161 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD) |
| 139 (kUrlUnescape[value] || | 162 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE) |
| 140 // ...and we allow some additional unescaping when flags are set. | 163 // |
| 141 (value == ' ' && (rules & UnescapeRule::SPACES)) || | 164 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC |
| 142 // Allow any of the prohibited but non-control characters when | 165 // 3987 above has since added some new BiDi control characters. |
| 143 // we're doing "special" chars. | 166 // http://www.unicode.org/reports/tr9 |
| 144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || | 167 // |
| 145 // Additionally allow control characters if requested. | 168 // U+061C ARABIC LETTER MARK (%D8%9C) |
| 146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { | 169 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) |
| 147 // Use the unescaped version of the character. | 170 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) |
| 148 adjustments.push_back(i); | 171 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) |
| 149 result.push_back(value); | 172 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) |
| 150 i += 2; | 173 |
| 151 } else { | 174 unsigned char second_byte; |
| 152 // Keep escaped. Append a percent and we'll get the following two | 175 // Check for ALM. |
| 153 // digits on the next loops through. | 176 if ((first_byte == 0xD8) && |
| 154 result.push_back('%'); | 177 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && |
| 178 (second_byte == 0x9c)) { |
| 179 result.append(escaped_text, i, 6); |
| 180 i += 5; |
| 181 continue; |
| 182 } |
| 183 |
| 184 // Check for other BiDi control characters. |
| 185 if ((first_byte == 0xE2) && |
| 186 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && |
| 187 ((second_byte == 0x80) || (second_byte == 0x81))) { |
| 188 unsigned char third_byte; |
| 189 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) && |
| 190 ((second_byte == 0x80) ? |
| 191 ((third_byte == 0x8E) || (third_byte == 0x8F) || |
| 192 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) : |
| 193 ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) { |
| 194 result.append(escaped_text, i, 9); |
| 195 i += 8; |
| 196 continue; |
| 155 } | 197 } |
| 198 } |
| 199 |
| 200 if (first_byte >= 0x80 || // Unescape all high-bit characters. |
| 201 // For 7-bit characters, the lookup table tells us all valid chars. |
| 202 (kUrlUnescape[first_byte] || |
| 203 // ...and we allow some additional unescaping when flags are set. |
| 204 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) || |
| 205 // Allow any of the prohibited but non-control characters when |
| 206 // we're doing "special" chars. |
| 207 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || |
| 208 // Additionally allow control characters if requested. |
| 209 (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { |
| 210 // Use the unescaped version of the character. |
| 211 adjustments.push_back(i); |
| 212 result.push_back(first_byte); |
| 213 i += 2; |
| 156 } else { | 214 } else { |
| 157 // Invalid escape sequence, just pass the percent through and continue | 215 // Keep escaped. Append a percent and we'll get the following two |
| 158 // right after it. | 216 // digits on the next loops through. |
| 159 result.push_back('%'); | 217 result.push_back('%'); |
| 160 } | 218 } |
| 161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && | 219 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && |
| 162 escaped_text[i] == '+') { | 220 escaped_text[i] == '+') { |
| 163 result.push_back(' '); | 221 result.push_back(' '); |
| 164 } else { | 222 } else { |
| 165 // Normal case for unescaped characters. | 223 // Normal case for unescaped characters. |
| 166 result.push_back(escaped_text[i]); | 224 result.push_back(escaped_text[i]); |
| 167 } | 225 } |
| 168 } | 226 } |
| (...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 385 return; | 443 return; |
| 386 } | 444 } |
| 387 adjusted_offset -= 2; | 445 adjusted_offset -= 2; |
| 388 } | 446 } |
| 389 offset = adjusted_offset; | 447 offset = adjusted_offset; |
| 390 } | 448 } |
| 391 | 449 |
| 392 } // namespace internal | 450 } // namespace internal |
| 393 | 451 |
| 394 } // namespace net | 452 } // namespace net |
| OLD | NEW |