Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(434)

Side by Side Diff: net/base/escape.cc

Issue 181483008: Don't unescape BiDi control characters in URL components (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed comments - 3 Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | net/base/escape_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/base/escape.h" 5 #include "net/base/escape.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h" 10 #include "base/memory/scoped_ptr.h"
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
90 // @ A B C D E F G H I J K L M N O 90 // @ A B C D E F G H I J K L M N O
91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92 // P Q R S T U V W X Y Z [ \ ] ^ _ 92 // P Q R S T U V W X Y Z [ \ ] ^ _
93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
94 // ` a b c d e f g h i j k l m n o 94 // ` a b c d e f g h i j k l m n o
95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
96 // p q r s t u v w x y z { | } ~ <DEL> 96 // p q r s t u v w x y z { | } ~ <DEL>
97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
98 }; 98 };
99 99
100 // Attempts to unescape the sequence at |index| within |escaped_text|. If
101 // successful, sets |value| to the unescaped value. Returns whether
102 // unescaping succeeded.
103 template<typename STR>
104 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
105 size_t index,
106 unsigned char* value) {
107 if ((index + 2) >= escaped_text.size())
108 return false;
109 if (escaped_text[index] != '%')
110 return false;
111 const typename STR::value_type most_sig_digit(
112 static_cast<typename STR::value_type>(escaped_text[index + 1]));
113 const typename STR::value_type least_sig_digit(
114 static_cast<typename STR::value_type>(escaped_text[index + 2]));
115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
116 *value = HexDigitToInt(most_sig_digit) * 16 +
117 HexDigitToInt(least_sig_digit);
118 return true;
119 }
120 return false;
121 }
122
100 template<typename STR> 123 template<typename STR>
101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, 124 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
102 UnescapeRule::Type rules, 125 UnescapeRule::Type rules,
103 std::vector<size_t>* offsets_for_adjustment) { 126 std::vector<size_t>* offsets_for_adjustment) {
104 if (offsets_for_adjustment) { 127 if (offsets_for_adjustment) {
105 std::for_each(offsets_for_adjustment->begin(), 128 std::for_each(offsets_for_adjustment->begin(),
106 offsets_for_adjustment->end(), 129 offsets_for_adjustment->end(),
107 base::LimitOffset<STR>(escaped_text.length())); 130 base::LimitOffset<STR>(escaped_text.length()));
108 } 131 }
109 // Do not unescape anything, return the |escaped_text| text. 132 // Do not unescape anything, return the |escaped_text| text.
110 if (rules == UnescapeRule::NONE) 133 if (rules == UnescapeRule::NONE)
111 return escaped_text; 134 return escaped_text;
112 135
113 // The output of the unescaping is never longer than the input, so we can 136 // The output of the unescaping is never longer than the input, so we can
114 // reserve the input size to make sure we have enough buffer and don't have 137 // reserve the input size to make sure we have enough buffer and don't have
115 // to allocate in the loop below. 138 // to allocate in the loop below.
116 STR result; 139 STR result;
117 result.reserve(escaped_text.length()); 140 result.reserve(escaped_text.length());
118 141
119 // Locations of adjusted text. 142 // Locations of adjusted text.
120 net::internal::AdjustEncodingOffset::Adjustments adjustments; 143 net::internal::AdjustEncodingOffset::Adjustments adjustments;
121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { 144 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { 145 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
123 // Non ASCII character, append as is. 146 // Non ASCII character, append as is.
124 result.push_back(escaped_text[i]); 147 result.push_back(escaped_text[i]);
125 continue; 148 continue;
126 } 149 }
127 150
128 char current_char = static_cast<char>(escaped_text[i]); 151 unsigned char first_byte;
129 if (current_char == '%' && i + 2 < max) { 152 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {
130 const typename STR::value_type most_sig_digit( 153 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
131 static_cast<typename STR::value_type>(escaped_text[i + 1])); 154 // control characters are not allowed to appear unescaped in URLs:
132 const typename STR::value_type least_sig_digit( 155 //
133 static_cast<typename STR::value_type>(escaped_text[i + 2])); 156 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)
134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { 157 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)
135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 + 158 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
136 HexDigitToInt(least_sig_digit); 159 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
137 if (value >= 0x80 || // Unescape all high-bit characters. 160 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
138 // For 7-bit characters, the lookup table tells us all valid chars. 161 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
139 (kUrlUnescape[value] || 162 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
140 // ...and we allow some additional unescaping when flags are set. 163 //
141 (value == ' ' && (rules & UnescapeRule::SPACES)) || 164 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
142 // Allow any of the prohibited but non-control characters when 165 // 3987 above has since added some new BiDi control characters.
143 // we're doing "special" chars. 166 // http://www.unicode.org/reports/tr9
144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || 167 //
145 // Additionally allow control characters if requested. 168 // U+061C ARABIC LETTER MARK (%D8%9C)
146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { 169 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
147 // Use the unescaped version of the character. 170 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
148 adjustments.push_back(i); 171 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)
149 result.push_back(value); 172 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)
150 i += 2; 173
151 } else { 174 unsigned char second_byte;
152 // Keep escaped. Append a percent and we'll get the following two 175 // Check for ALM.
153 // digits on the next loops through. 176 if ((first_byte == 0xD8) &&
154 result.push_back('%'); 177 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&
178 (second_byte == 0x9c)) {
179 result.append(escaped_text, i, 6);
180 i += 5;
181 continue;
182 }
183
184 // Check for other BiDi control characters.
185 if ((first_byte == 0xE2) &&
186 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&
187 ((second_byte == 0x80) || (second_byte == 0x81))) {
188 unsigned char third_byte;
189 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) &&
190 ((second_byte == 0x80) ?
191 ((third_byte == 0x8E) || (third_byte == 0x8F) ||
192 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) :
193 ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) {
194 result.append(escaped_text, i, 9);
195 i += 8;
196 continue;
155 } 197 }
198 }
199
200 if (first_byte >= 0x80 || // Unescape all high-bit characters.
201 // For 7-bit characters, the lookup table tells us all valid chars.
202 (kUrlUnescape[first_byte] ||
203 // ...and we allow some additional unescaping when flags are set.
204 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||
205 // Allow any of the prohibited but non-control characters when
206 // we're doing "special" chars.
207 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
208 // Additionally allow control characters if requested.
209 (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
210 // Use the unescaped version of the character.
211 adjustments.push_back(i);
212 result.push_back(first_byte);
213 i += 2;
156 } else { 214 } else {
157 // Invalid escape sequence, just pass the percent through and continue 215 // Keep escaped. Append a percent and we'll get the following two
158 // right after it. 216 // digits on the next loops through.
159 result.push_back('%'); 217 result.push_back('%');
160 } 218 }
161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && 219 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
162 escaped_text[i] == '+') { 220 escaped_text[i] == '+') {
163 result.push_back(' '); 221 result.push_back(' ');
164 } else { 222 } else {
165 // Normal case for unescaped characters. 223 // Normal case for unescaped characters.
166 result.push_back(escaped_text[i]); 224 result.push_back(escaped_text[i]);
167 } 225 }
168 } 226 }
(...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after
385 return; 443 return;
386 } 444 }
387 adjusted_offset -= 2; 445 adjusted_offset -= 2;
388 } 446 }
389 offset = adjusted_offset; 447 offset = adjusted_offset;
390 } 448 }
391 449
392 } // namespace internal 450 } // namespace internal
393 451
394 } // namespace net 452 } // namespace net
OLDNEW
« no previous file with comments | « no previous file | net/base/escape_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698