Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(184)

Side by Side Diff: net/base/escape.cc

Issue 181483008: Don't unescape BiDi control characters in URL components (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed comments Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/base/escape.h" 5 #include "net/base/escape.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h" 10 #include "base/memory/scoped_ptr.h"
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
90 // @ A B C D E F G H I J K L M N O 90 // @ A B C D E F G H I J K L M N O
91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92 // P Q R S T U V W X Y Z [ \ ] ^ _ 92 // P Q R S T U V W X Y Z [ \ ] ^ _
93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
94 // ` a b c d e f g h i j k l m n o 94 // ` a b c d e f g h i j k l m n o
95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
96 // p q r s t u v w x y z { | } ~ <NBSP> 96 // p q r s t u v w x y z { | } ~ <NBSP>
97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
98 }; 98 };
99 99
100 // Attempts to unescape the sequence at |index| within |escaped_text|. If
101 // successful, sets |value| to the unescaped value. Returns whether
102 // unescaping succeeded.
103 template<typename STR>
104 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
105 size_t index,
106 unsigned char* value) {
107 if (index + 2 < escaped_text.size()) {
Peter Kasting 2014/02/27 21:37:09 Nit: Use an early-return here, as well as on the I...
Anuj 2014/02/27 21:55:35 Done.
108 char current_char = static_cast<char>(escaped_text[index]);
Peter Kasting 2014/02/27 21:37:09 Nit: Inline into the next statement.
Anuj 2014/02/27 21:55:35 Done.
109 if (current_char != '%')
110 return false;
111 const typename STR::value_type most_sig_digit(
112 static_cast<typename STR::value_type>(escaped_text[index + 1]));
113 const typename STR::value_type least_sig_digit(
114 static_cast<typename STR::value_type>(escaped_text[index + 2]));
115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
116 *value = HexDigitToInt(most_sig_digit) * 16 +
117 HexDigitToInt(least_sig_digit);
118 return true;
119 }
120 }
121 return false;
122 }
123
100 template<typename STR> 124 template<typename STR>
101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, 125 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
102 UnescapeRule::Type rules, 126 UnescapeRule::Type rules,
103 std::vector<size_t>* offsets_for_adjustment) { 127 std::vector<size_t>* offsets_for_adjustment) {
104 if (offsets_for_adjustment) { 128 if (offsets_for_adjustment) {
105 std::for_each(offsets_for_adjustment->begin(), 129 std::for_each(offsets_for_adjustment->begin(),
106 offsets_for_adjustment->end(), 130 offsets_for_adjustment->end(),
107 base::LimitOffset<STR>(escaped_text.length())); 131 base::LimitOffset<STR>(escaped_text.length()));
108 } 132 }
109 // Do not unescape anything, return the |escaped_text| text. 133 // Do not unescape anything, return the |escaped_text| text.
110 if (rules == UnescapeRule::NONE) 134 if (rules == UnescapeRule::NONE)
111 return escaped_text; 135 return escaped_text;
112 136
113 // The output of the unescaping is always smaller than the input, so we can 137 // The output of the unescaping is always smaller than the input, so we can
114 // reserve the input size to make sure we have enough buffer and don't have 138 // reserve the input size to make sure we have enough buffer and don't have
115 // to allocate in the loop below. 139 // to allocate in the loop below.
116 STR result; 140 STR result;
117 result.reserve(escaped_text.length()); 141 result.reserve(escaped_text.length());
118 142
119 // Locations of adjusted text. 143 // Locations of adjusted text.
120 net::internal::AdjustEncodingOffset::Adjustments adjustments; 144 net::internal::AdjustEncodingOffset::Adjustments adjustments;
121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { 145 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { 146 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
123 // Non ASCII character, append as is. 147 // Non ASCII character, append as is.
124 result.push_back(escaped_text[i]); 148 result.push_back(escaped_text[i]);
125 continue; 149 continue;
126 } 150 }
127 151
128 char current_char = static_cast<char>(escaped_text[i]); 152 unsigned char value;
129 if (current_char == '%' && i + 2 < max) { 153 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &value)) {
130 const typename STR::value_type most_sig_digit( 154 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
131 static_cast<typename STR::value_type>(escaped_text[i + 1])); 155 // control characters are not allowed to appear unescaped in URLs:
132 const typename STR::value_type least_sig_digit( 156 //
133 static_cast<typename STR::value_type>(escaped_text[i + 2])); 157 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)
134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { 158 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)
135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 + 159 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
136 HexDigitToInt(least_sig_digit); 160 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
137 if (value >= 0x80 || // Unescape all high-bit characters. 161 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
138 // For 7-bit characters, the lookup table tells us all valid chars. 162 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
139 (kUrlUnescape[value] || 163 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
140 // ...and we allow some additional unescaping when flags are set. 164 //
141 (value == ' ' && (rules & UnescapeRule::SPACES)) || 165 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
142 // Allow any of the prohibited but non-control characters when 166 // 3987 above has since added some new BiDi control characters.
143 // we're doing "special" chars. 167 // http://www.unicode.org/reports/tr9
144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || 168 //
145 // Additionally allow control characters if requested. 169 // U+061C ARABIC LETTER MARK (%D8%9C)
146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { 170 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
147 // Use the unescaped version of the character. 171 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
148 adjustments.push_back(i); 172 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)
149 result.push_back(value); 173 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)
150 i += 2; 174
151 } else { 175 unsigned char tmp_value1;
Peter Kasting 2014/02/27 21:37:09 Nit: Just call this "temp" and use it for both tmp...
Anuj 2014/02/27 21:55:35 Done.
152 // Keep escaped. Append a percent and we'll get the following two 176 if (value == 0xD8) {
Peter Kasting 2014/02/27 21:37:09 Nit: Combine conditionals here and hoist the comment...
Anuj 2014/02/27 21:55:35 Done.
153 // digits on the next loops through. 177 // Possible Arabic Letter Mark.
154 result.push_back('%'); 178 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1) &&
179 (tmp_value1 == 0x9c)) {
180 result.append(escaped_text, i, 6);
Peter Kasting 2014/02/27 21:37:09 Nit: Indented too far
Anuj 2014/02/27 21:55:35 Done.
181 i += 5;
182 continue;
155 } 183 }
184 }
185
186 if (value == 0xE2) {
Peter Kasting 2014/02/27 21:37:09 Nit: Similarly: // Check for other BiDi con...
Anuj 2014/02/27 21:55:35 I will pass on this change. I think compiler should...
Peter Kasting 2014/02/27 22:08:38 It's not a question of optimized code, it's a question...
Anuj 2014/02/27 22:24:51 I made the exact same change just as you sent this...
187 // Possible BiDi control character.
188 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1)) {
189 if ((tmp_value1 == 0x80) || (tmp_value1 == 0x81)) {
190 unsigned char tmp_value2;
191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &tmp_value2)) {
192 // Embeddings, Overrides and Marks have second byte 0x80.
193 bool is_bidi_control_char = ((tmp_value1 == 0x80) &&
194 ((tmp_value2 == 0x8E) || (tmp_value2 == 0x8F) ||
195 ((tmp_value2 >= 0xAA) && (tmp_value2 <= 0xAE))));
196
197 // Isolates have second byte 0x81.
198 is_bidi_control_char |= ((tmp_value1 == 0x81) &&
199 ((tmp_value2 >= 0xA6) && (tmp_value2 <= 0xA9)));
200 if (is_bidi_control_char) {
201 result.append(escaped_text, i, 9);
202 i += 8;
203 continue;
204 }
205 }
206 }
207 }
208 }
209
210 if (value >= 0x80 || // Unescape all high-bit characters.
211 // For 7-bit characters, the lookup table tells us all valid chars.
212 (kUrlUnescape[value] ||
213 // ...and we allow some additional unescaping when flags are set.
214 (value == ' ' && (rules & UnescapeRule::SPACES)) ||
215 // Allow any of the prohibited but non-control characters when
216 // we're doing "special" chars.
217 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
218 // Additionally allow control characters if requested.
219 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
220 // Use the unescaped version of the character.
221 adjustments.push_back(i);
222 result.push_back(value);
223 i += 2;
156 } else { 224 } else {
157 // Invalid escape sequence, just pass the percent through and continue 225 // Keep escaped. Append a percent and we'll get the following two
158 // right after it. 226 // digits on the next loops through.
159 result.push_back('%'); 227 result.push_back('%');
160 } 228 }
161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && 229 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
162 escaped_text[i] == '+') { 230 escaped_text[i] == '+') {
163 result.push_back(' '); 231 result.push_back(' ');
164 } else { 232 } else {
165 // Normal case for unescaped characters. 233 // Normal case for unescaped characters.
166 result.push_back(escaped_text[i]); 234 result.push_back(escaped_text[i]);
167 } 235 }
168 } 236 }
(...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after
385 return; 453 return;
386 } 454 }
387 adjusted_offset -= 2; 455 adjusted_offset -= 2;
388 } 456 }
389 offset = adjusted_offset; 457 offset = adjusted_offset;
390 } 458 }
391 459
392 } // namespace internal 460 } // namespace internal
393 461
394 } // namespace net 462 } // namespace net
OLDNEW
« base/i18n/rtl.h ('K') | « base/i18n/rtl.h ('k') | net/base/escape_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698