OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "net/base/escape.h" | 5 #include "net/base/escape.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 | 8 |
9 #include "base/logging.h" | 9 #include "base/logging.h" |
10 #include "base/memory/scoped_ptr.h" | 10 #include "base/memory/scoped_ptr.h" |
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
113 const typename STR::value_type least_sig_digit( | 113 const typename STR::value_type least_sig_digit( |
114 static_cast<typename STR::value_type>(escaped_text[index + 2])); | 114 static_cast<typename STR::value_type>(escaped_text[index + 2])); |
115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { | 115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { |
116 *value = HexDigitToInt(most_sig_digit) * 16 + | 116 *value = HexDigitToInt(most_sig_digit) * 16 + |
117 HexDigitToInt(least_sig_digit); | 117 HexDigitToInt(least_sig_digit); |
118 return true; | 118 return true; |
119 } | 119 } |
120 return false; | 120 return false; |
121 } | 121 } |
122 | 122 |
123 // Returns true if there is an Arabic Letter Mark at |index|. |first_byte| | |
124 // is the byte at |index|. | |
125 template<typename STR> | |
126 bool HasArabicLanguageMarkAtIndex(const STR& escaped_text, | |
127 unsigned char first_byte, | |
128 size_t index) { | |
129 if (first_byte != 0xD8) | |
130 return false; | |
131 unsigned char second_byte; | |
132 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte)) | |
133 return false; | |
134 return second_byte == 0x9c; | |
135 } | |
136 | |
137 // Returns true if second and third bytes are of a three byte BiDi control | |
138 // character sequence. | |
139 bool IsLastTwoBytesofThreeByteBidiControlChar(unsigned char second_byte, | |
140 unsigned char third_byte) { | |
mmenke
2014/10/20 15:46:40
This function name is a lie. It's actually IsLastT
meacer
2014/10/20 17:23:26
Sounds reasonable, inlined.
| |
141 if (second_byte == 0x80) { | |
142 return third_byte == 0x8E || | |
143 third_byte == 0x8F || | |
144 (third_byte >= 0xAA && third_byte <= 0xAE); | |
145 } | |
mmenke
2014/10/20 15:46:40
DCHECK_EQ(0x81, second_byte)? If this is a separa
meacer
2014/10/20 17:23:26
Done.
| |
146 return third_byte >= 0xA6 && third_byte <= 0xA9; | |
147 } | |
148 | |
149 // Returns true if there is a BiDi control char at |index|. |first_byte| is the | |
150 // byte at |index|. | |
151 template<typename STR> | |
152 bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text, | |
153 unsigned char first_byte, | |
154 size_t index) { | |
155 if (first_byte != 0xE2) | |
156 return false; | |
157 unsigned char second_byte; | |
158 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte)) | |
159 return false; | |
160 if (second_byte != 0x80 && second_byte != 0x81) | |
161 return false; | |
162 unsigned char third_byte; | |
163 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte)) | |
164 return false; | |
165 return IsLastTwoBytesofThreeByteBidiControlChar(second_byte, third_byte); | |
166 } | |
167 | |
123 // Unescapes |escaped_text| according to |rules|, returning the resulting | 168 // Unescapes |escaped_text| according to |rules|, returning the resulting |
124 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects | 169 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects |
125 // the alterations done to the string that are not one-character-to-one- | 170 // the alterations done to the string that are not one-character-to-one- |
126 // character. The resulting |adjustments| will always be sorted by increasing | 171 // character. The resulting |adjustments| will always be sorted by increasing |
127 // offset. | 172 // offset. |
128 template<typename STR> | 173 template<typename STR> |
129 STR UnescapeURLWithAdjustmentsImpl( | 174 STR UnescapeURLWithAdjustmentsImpl( |
130 const STR& escaped_text, | 175 const STR& escaped_text, |
131 UnescapeRule::Type rules, | 176 UnescapeRule::Type rules, |
132 base::OffsetAdjuster::Adjustments* adjustments) { | 177 base::OffsetAdjuster::Adjustments* adjustments) { |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
165 // | 210 // |
166 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC | 211 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC |
167 // 3987 above has since added some new BiDi control characters. | 212 // 3987 above has since added some new BiDi control characters. |
168 // http://www.unicode.org/reports/tr9 | 213 // http://www.unicode.org/reports/tr9 |
169 // | 214 // |
170 // U+061C ARABIC LETTER MARK (%D8%9C) | 215 // U+061C ARABIC LETTER MARK (%D8%9C) |
171 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) | 216 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) |
172 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) | 217 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) |
173 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) | 218 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) |
174 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) | 219 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) |
175 | 220 // |
176 unsigned char second_byte; | 221 // However, escaping these characters in data: urls results in |
177 // Check for ALM. | 222 // escaped BiDi control characters being displayed in the rendered html, |
178 if ((first_byte == 0xD8) && | 223 // so the parsing for data: urls is allowed to force unescaping of these |
179 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && | 224 // characters. DO NOT use BIDI_CONTROL_CHARS flag without talking to a |
180 (second_byte == 0x9c)) { | 225 // security person. |
181 result.append(escaped_text, i, 6); | 226 if (!(rules & UnescapeRule::BIDI_CONTROL_CHARS)) { |
182 i += 5; | 227 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) { |
183 continue; | 228 result.append(escaped_text, i, 6); |
184 } | 229 i += 5; |
185 | 230 continue; |
186 // Check for other BiDi control characters. | 231 } |
187 if ((first_byte == 0xE2) && | 232 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) { |
188 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && | |
189 ((second_byte == 0x80) || (second_byte == 0x81))) { | |
190 unsigned char third_byte; | |
191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) && | |
192 ((second_byte == 0x80) ? | |
193 ((third_byte == 0x8E) || (third_byte == 0x8F) || | |
194 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) : | |
195 ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) { | |
196 result.append(escaped_text, i, 9); | 233 result.append(escaped_text, i, 9); |
197 i += 8; | 234 i += 8; |
198 continue; | 235 continue; |
199 } | 236 } |
200 } | 237 } |
201 | 238 |
202 if (first_byte >= 0x80 || // Unescape all high-bit characters. | 239 if (first_byte >= 0x80 || // Unescape all high-bit characters. |
203 // For 7-bit characters, the lookup table tells us all valid chars. | 240 // For 7-bit characters, the lookup table tells us all valid chars. |
204 (kUrlUnescape[first_byte] || | 241 (kUrlUnescape[first_byte] || |
205 // ...and we allow some additional unescaping when flags are set. | 242 // ...and we allow some additional unescaping when flags are set. |
(...skipping 196 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
402 1, kEscapeToChars[i].replacement); | 439 1, kEscapeToChars[i].replacement); |
403 break; | 440 break; |
404 } | 441 } |
405 } | 442 } |
406 } | 443 } |
407 } | 444 } |
408 return text; | 445 return text; |
409 } | 446 } |
410 | 447 |
411 } // namespace net | 448 } // namespace net |
OLD | NEW |