OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "net/base/escape.h" | 5 #include "net/base/escape.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 | 8 |
9 #include "base/logging.h" | 9 #include "base/logging.h" |
10 #include "base/memory/scoped_ptr.h" | 10 #include "base/memory/scoped_ptr.h" |
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
157 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte)) | 157 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte)) |
158 return false; | 158 return false; |
159 if (second_byte == 0x80) { | 159 if (second_byte == 0x80) { |
160 return third_byte == 0x8E || | 160 return third_byte == 0x8E || |
161 third_byte == 0x8F || | 161 third_byte == 0x8F || |
162 (third_byte >= 0xAA && third_byte <= 0xAE); | 162 (third_byte >= 0xAA && third_byte <= 0xAE); |
163 } | 163 } |
164 return third_byte >= 0xA6 && third_byte <= 0xA9; | 164 return third_byte >= 0xA6 && third_byte <= 0xA9; |
165 } | 165 } |
166 | 166 |
// Reports whether the percent-escaped sequence that begins at |index| in
// |escaped_text| spells out one of the banned four-byte characters listed
// below. |first_byte| is the value of the byte at |index|, supplied by the
// caller. Returns false if any of the three following escapes cannot be
// unescaped.
template <typename STR>
bool HasFourByteBannedCharAtIndex(const STR& escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  // These characters stay escaped because they could be used for spoofing:
  //   U+1F50F LOCK WITH INK PEN     (%F0%9F%94%8F)
  //   U+1F510 CLOSED LOCK WITH KEY  (%F0%9F%94%90)
  //   U+1F512 LOCK                  (%F0%9F%94%92)
  //   U+1F513 OPEN LOCK             (%F0%9F%94%93)
  // All four share the leading bytes F0 9F 94; only the last byte differs.
  if (first_byte != 0xF0)
    return false;

  // Each escape is three characters wide ("%XX"), so the remaining bytes are
  // read from offsets 3, 6 and 9 relative to |index|.
  unsigned char byte2;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &byte2))
    return false;
  if (byte2 != 0x9F)
    return false;

  unsigned char byte3;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &byte3))
    return false;
  if (byte3 != 0x94)
    return false;

  unsigned char byte4;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 9, &byte4))
    return false;

  // Only these four trailing bytes complete a banned character.
  switch (byte4) {
    case 0x8F:
    case 0x90:
    case 0x92:
    case 0x93:
      return true;
    default:
      return false;
  }
}

| 198 |
167 // Unescapes |escaped_text| according to |rules|, returning the resulting | 199 // Unescapes |escaped_text| according to |rules|, returning the resulting |
168 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects | 200 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects |
169 // the alterations done to the string that are not one-character-to-one- | 201 // the alterations done to the string that are not one-character-to-one- |
170 // character. The resulting |adjustments| will always be sorted by increasing | 202 // character. The resulting |adjustments| will always be sorted by increasing |
171 // offset. | 203 // offset. |
172 template<typename STR> | 204 template<typename STR> |
173 STR UnescapeURLWithAdjustmentsImpl( | 205 STR UnescapeURLWithAdjustmentsImpl( |
174 const STR& escaped_text, | 206 const STR& escaped_text, |
175 UnescapeRule::Type rules, | 207 UnescapeRule::Type rules, |
176 base::OffsetAdjuster::Adjustments* adjustments) { | 208 base::OffsetAdjuster::Adjustments* adjustments) { |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
210 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC | 242 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC |
211 // 3987 above has since added some new BiDi control characters. | 243 // 3987 above has since added some new BiDi control characters. |
212 // http://www.unicode.org/reports/tr9 | 244 // http://www.unicode.org/reports/tr9 |
213 // | 245 // |
214 // U+061C ARABIC LETTER MARK (%D8%9C) | 246 // U+061C ARABIC LETTER MARK (%D8%9C) |
215 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) | 247 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) |
216 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) | 248 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) |
217 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) | 249 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) |
218 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) | 250 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) |
219 // | 251 // |
| 252 // The following spoofable characters are also banned, because they could |
| 253 // be used to imitate parts of a web browser's UI. |
| 254 // |
| 255 // U+1F50F LOCK WITH INK PEN (%F0%9F%94%8F) |
| 256 // U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90) |
| 257 // U+1F512 LOCK (%F0%9F%94%92) |
| 258 // U+1F513 OPEN LOCK (%F0%9F%94%93) |
| 259 // |
220 // However, some schemes such as data: and file: need to parse the exact | 260 // However, some schemes such as data: and file: need to parse the exact |
221 // binary data when loading the URL. For that reason, CONTROL_CHARS allows | 261 // binary data when loading the URL. For that reason, |
222 // unescaping BiDi control characters. | 262 // SPOOFING_AND_CONTROL_CHARS allows unescaping BiDi control characters and |
223 // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed | 263 // the spoofable lock characters. DO NOT use SPOOFING_AND_CONTROL_CHARS if |
224 // in the UI. | 264 // the parsed URL is going to be displayed in the UI. |
225 if (!(rules & UnescapeRule::CONTROL_CHARS)) { | 265 if (!(rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)) { |
226 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) { | 266 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) { |
227 // Keep Arabic Language Mark escaped. | 267 // Keep Arabic Language Mark escaped. |
228 result.append(escaped_text, i, 6); | 268 result.append(escaped_text, i, 6); |
229 i += 5; | 269 i += 5; |
230 continue; | 270 continue; |
231 } | 271 } |
232 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) { | 272 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) { |
233 // Keep BiDi control char escaped. | 273 // Keep BiDi control char escaped. |
234 result.append(escaped_text, i, 9); | 274 result.append(escaped_text, i, 9); |
235 i += 8; | 275 i += 8; |
236 continue; | 276 continue; |
237 } | 277 } |
| 278 if (HasFourByteBannedCharAtIndex(escaped_text, first_byte, i)) { |
| 279 // Keep banned char escaped. |
| 280 result.append(escaped_text, i, 12); |
| 281 i += 11; |
| 282 continue; |
| 283 } |
238 } | 284 } |
239 | 285 |
240 if (first_byte >= 0x80 || // Unescape all high-bit characters. | 286 if (first_byte >= 0x80 || // Unescape all high-bit characters. |
241 // For 7-bit characters, the lookup table tells us all valid chars. | 287 // For 7-bit characters, the lookup table tells us all valid chars. |
242 (kUrlUnescape[first_byte] || | 288 (kUrlUnescape[first_byte] || |
243 // ...and we allow some additional unescaping when flags are set. | 289 // ...and we allow some additional unescaping when flags are set. |
244 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) || | 290 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) || |
245 // Allow any of the prohibited but non-control characters when | 291 // Allow any of the prohibited but non-control characters when |
246 // we're doing "special" chars. | 292 // we're doing "special" chars. |
247 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || | 293 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || |
248 // Additionally allow control characters if requested. | 294 // Additionally allow non-display characters if requested. |
249 (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { | 295 (first_byte < ' ' && |
| 296 (rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)))) { |
250 // Use the unescaped version of the character. | 297 // Use the unescaped version of the character. |
251 if (adjustments) | 298 if (adjustments) |
252 adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1)); | 299 adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1)); |
253 result.push_back(first_byte); | 300 result.push_back(first_byte); |
254 i += 2; | 301 i += 2; |
255 } else { | 302 } else { |
256 // Keep escaped. Append a percent and we'll get the following two | 303 // Keep escaped. Append a percent and we'll get the following two |
257 // digits on the next loops through. | 304 // digits on the next loops through. |
258 result.push_back('%'); | 305 result.push_back('%'); |
259 } | 306 } |
(...skipping 194 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
454 1, kEscapeToChars[i].replacement); | 501 1, kEscapeToChars[i].replacement); |
455 break; | 502 break; |
456 } | 503 } |
457 } | 504 } |
458 } | 505 } |
459 } | 506 } |
460 return text; | 507 return text; |
461 } | 508 } |
462 | 509 |
463 } // namespace net | 510 } // namespace net |
OLD | NEW |