Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "net/base/escape.h" | 5 #include "net/base/escape.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 | 8 |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| 10 #include "base/memory/scoped_ptr.h" | 10 #include "base/memory/scoped_ptr.h" |
| (...skipping 79 matching lines...) | |
| 90 // @ A B C D E F G H I J K L M N O | 90 // @ A B C D E F G H I J K L M N O |
| 91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 92 // P Q R S T U V W X Y Z [ \ ] ^ _ | 92 // P Q R S T U V W X Y Z [ \ ] ^ _ |
| 93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, | 93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, |
| 94 // ` a b c d e f g h i j k l m n o | 94 // ` a b c d e f g h i j k l m n o |
| 95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 96 // p q r s t u v w x y z { | } ~ <NBSP> | 96 // p q r s t u v w x y z { | } ~ <NBSP> |
| 97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 | 97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 |
| 98 }; | 98 }; |
| 99 | 99 |
| 100 // Attempts to unescape the sequence at |index| within |escaped_text|. If | |
| 101 // successful, sets |value| to the unescaped value. Returns whether | |
| 102 // unescaping succeeded. | |
| 103 template<typename STR> | |
| 104 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text, | |
| 105 size_t index, | |
| 106 unsigned char* value) { | |
| 107 if (index + 2 < escaped_text.size()) { | |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Use an early-return here, as well as on the I
Anuj
2014/02/27 21:55:35
Done.
| |
| 108 char current_char = static_cast<char>(escaped_text[index]); | |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Inline into the next statement.
Anuj
2014/02/27 21:55:35
Done.
| |
| 109 if (current_char != '%') | |
| 110 return false; | |
| 111 const typename STR::value_type most_sig_digit( | |
| 112 static_cast<typename STR::value_type>(escaped_text[index + 1])); | |
| 113 const typename STR::value_type least_sig_digit( | |
| 114 static_cast<typename STR::value_type>(escaped_text[index + 2])); | |
| 115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { | |
| 116 *value = HexDigitToInt(most_sig_digit) * 16 + | |
| 117 HexDigitToInt(least_sig_digit); | |
| 118 return true; | |
| 119 } | |
| 120 } | |
| 121 return false; | |
| 122 } | |
| 123 | |
| 100 template<typename STR> | 124 template<typename STR> |
| 101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, | 125 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, |
| 102 UnescapeRule::Type rules, | 126 UnescapeRule::Type rules, |
| 103 std::vector<size_t>* offsets_for_adjustment) { | 127 std::vector<size_t>* offsets_for_adjustment) { |
| 104 if (offsets_for_adjustment) { | 128 if (offsets_for_adjustment) { |
| 105 std::for_each(offsets_for_adjustment->begin(), | 129 std::for_each(offsets_for_adjustment->begin(), |
| 106 offsets_for_adjustment->end(), | 130 offsets_for_adjustment->end(), |
| 107 base::LimitOffset<STR>(escaped_text.length())); | 131 base::LimitOffset<STR>(escaped_text.length())); |
| 108 } | 132 } |
| 109 // Do not unescape anything, return the |escaped_text| text. | 133 // Do not unescape anything, return the |escaped_text| text. |
| 110 if (rules == UnescapeRule::NONE) | 134 if (rules == UnescapeRule::NONE) |
| 111 return escaped_text; | 135 return escaped_text; |
| 112 | 136 |
| 113 // The output of the unescaping is always smaller than the input, so we can | 137 // The output of the unescaping is always smaller than the input, so we can |
| 114 // reserve the input size to make sure we have enough buffer and don't have | 138 // reserve the input size to make sure we have enough buffer and don't have |
| 115 // to allocate in the loop below. | 139 // to allocate in the loop below. |
| 116 STR result; | 140 STR result; |
| 117 result.reserve(escaped_text.length()); | 141 result.reserve(escaped_text.length()); |
| 118 | 142 |
| 119 // Locations of adjusted text. | 143 // Locations of adjusted text. |
| 120 net::internal::AdjustEncodingOffset::Adjustments adjustments; | 144 net::internal::AdjustEncodingOffset::Adjustments adjustments; |
| 121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { | 145 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { |
| 122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { | 146 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { |
| 123 // Non ASCII character, append as is. | 147 // Non ASCII character, append as is. |
| 124 result.push_back(escaped_text[i]); | 148 result.push_back(escaped_text[i]); |
| 125 continue; | 149 continue; |
| 126 } | 150 } |
| 127 | 151 |
| 128 char current_char = static_cast<char>(escaped_text[i]); | 152 unsigned char value; |
| 129 if (current_char == '%' && i + 2 < max) { | 153 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &value)) { |
| 130 const typename STR::value_type most_sig_digit( | 154 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi |
| 131 static_cast<typename STR::value_type>(escaped_text[i + 1])); | 155 // control characters are not allowed to appear unescaped in URLs: |
| 132 const typename STR::value_type least_sig_digit( | 156 // |
| 133 static_cast<typename STR::value_type>(escaped_text[i + 2])); | 157 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E) |
| 134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { | 158 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F) |
| 135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 + | 159 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA) |
| 136 HexDigitToInt(least_sig_digit); | 160 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB) |
| 137 if (value >= 0x80 || // Unescape all high-bit characters. | 161 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC) |
| 138 // For 7-bit characters, the lookup table tells us all valid chars. | 162 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD) |
| 139 (kUrlUnescape[value] || | 163 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE) |
| 140 // ...and we allow some additional unescaping when flags are set. | 164 // |
| 141 (value == ' ' && (rules & UnescapeRule::SPACES)) || | 165 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC |
| 142 // Allow any of the prohibited but non-control characters when | 166 // 3987 above has since added some new BiDi control characters. |
| 143 // we're doing "special" chars. | 167 // http://www.unicode.org/reports/tr9 |
| 144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || | 168 // |
| 145 // Additionally allow control characters if requested. | 169 // U+061C ARABIC LETTER MARK (%D8%9C) |
| 146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { | 170 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) |
| 147 // Use the unescaped version of the character. | 171 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) |
| 148 adjustments.push_back(i); | 172 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) |
| 149 result.push_back(value); | 173 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) |
| 150 i += 2; | 174 |
| 151 } else { | 175 unsigned char tmp_value1; |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Just call this "temp" and use it for both tmp
Anuj
2014/02/27 21:55:35
Done.
| |
| 152 // Keep escaped. Append a percent and we'll get the following two | 176 if (value == 0xD8) { |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Combine conditionals here and hoist the comme
Anuj
2014/02/27 21:55:35
Done.
| |
| 153 // digits on the next loops through. | 177 // Possible Arabic Letter Mark. |
| 154 result.push_back('%'); | 178 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1) && |
| 179 (tmp_value1 == 0x9c)) { | |
| 180 result.append(escaped_text, i, 6); | |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Indented too far
Anuj
2014/02/27 21:55:35
Done.
| |
| 181 i += 5; | |
| 182 continue; | |
| 155 } | 183 } |
| 184 } | |
| 185 | |
| 186 if (value == 0xE2) { | |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Similarly:
// Check for other BiDi con
Anuj
2014/02/27 21:55:35
I will pass on this change. I think compiler shoul
Peter Kasting
2014/02/27 22:08:38
It's not a question of optimized code, it's a ques
Anuj
2014/02/27 22:24:51
I made the exact same change just as you sent this
| |
| 187 // Possible BiDi control character. | |
| 188 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1)) { | |
| 189 if ((tmp_value1 == 0x80) || (tmp_value1 == 0x81)) { | |
| 190 unsigned char tmp_value2; | |
| 191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &tmp_value2)) { | |
| 192 // Embeddings, Overrides and Marks have second byte 0x80. | |
| 193 bool is_bidi_control_char = ((tmp_value1 == 0x80) && | |
| 194 ((tmp_value2 == 0x8E) || (tmp_value2 == 0x8F) || | |
| 195 ((tmp_value2 >= 0xAA) && (tmp_value2 <= 0xAE)))); | |
| 196 | |
| 197 // Isolates have second byte 0x81. | |
| 198 is_bidi_control_char |= ((tmp_value1 == 0x81) && | |
| 199 ((tmp_value2 >= 0xA6) && (tmp_value2 <= 0xA9))); | |
| 200 if (is_bidi_control_char) { | |
| 201 result.append(escaped_text, i, 9); | |
| 202 i += 8; | |
| 203 continue; | |
| 204 } | |
| 205 } | |
| 206 } | |
| 207 } | |
| 208 } | |
| 209 | |
| 210 if (value >= 0x80 || // Unescape all high-bit characters. | |
| 211 // For 7-bit characters, the lookup table tells us all valid chars. | |
| 212 (kUrlUnescape[value] || | |
| 213 // ...and we allow some additional unescaping when flags are set. | |
| 214 (value == ' ' && (rules & UnescapeRule::SPACES)) || | |
| 215 // Allow any of the prohibited but non-control characters when | |
| 216 // we're doing "special" chars. | |
| 217 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || | |
| 218 // Additionally allow control characters if requested. | |
| 219 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { | |
| 220 // Use the unescaped version of the character. | |
| 221 adjustments.push_back(i); | |
| 222 result.push_back(value); | |
| 223 i += 2; | |
| 156 } else { | 224 } else { |
| 157 // Invalid escape sequence, just pass the percent through and continue | 225 // Keep escaped. Append a percent and we'll get the following two |
| 158 // right after it. | 226 // digits on the next loops through. |
| 159 result.push_back('%'); | 227 result.push_back('%'); |
| 160 } | 228 } |
| 161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && | 229 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && |
| 162 escaped_text[i] == '+') { | 230 escaped_text[i] == '+') { |
| 163 result.push_back(' '); | 231 result.push_back(' '); |
| 164 } else { | 232 } else { |
| 165 // Normal case for unescaped characters. | 233 // Normal case for unescaped characters. |
| 166 result.push_back(escaped_text[i]); | 234 result.push_back(escaped_text[i]); |
| 167 } | 235 } |
| 168 } | 236 } |
| (...skipping 216 matching lines...) | |
| 385 return; | 453 return; |
| 386 } | 454 } |
| 387 adjusted_offset -= 2; | 455 adjusted_offset -= 2; |
| 388 } | 456 } |
| 389 offset = adjusted_offset; | 457 offset = adjusted_offset; |
| 390 } | 458 } |
| 391 | 459 |
| 392 } // namespace internal | 460 } // namespace internal |
| 393 | 461 |
| 394 } // namespace net | 462 } // namespace net |
| OLD | NEW |