OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Canonicalizers for random bits that aren't big enough for their own files. | 5 // Canonicalizers for random bits that aren't big enough for their own files. |
6 | 6 |
7 #include <string.h> | 7 #include <string.h> |
8 | 8 |
9 #include "url/url_canon.h" | 9 #include "url/url_canon.h" |
10 #include "url/url_canon_internal.h" | 10 #include "url/url_canon_internal.h" |
11 | 11 |
12 namespace url { | 12 namespace url { |
13 | 13 |
14 namespace { | 14 namespace { |
15 | 15 |
// Returns true if |ch| is one of the characters that must be stripped from
// the middle of a URL: carriage return, line feed, or horizontal tab.
inline bool IsRemovableURLWhitespace(int ch) {
  return ch == '\r' || ch == '\n' || ch == '\t';
}
21 | 21 |
22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h). | 22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h). |
23 // It sucks that we have to do this, since this takes about 13% of the total URL | 23 // It sucks that we have to do this, since this takes about 13% of the total URL |
24 // canonicalization time. | 24 // canonicalization time. |
25 template<typename CHAR> | 25 template <typename CHAR> |
26 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len, | 26 const CHAR* DoRemoveURLWhitespace(const CHAR* input, |
| 27 int input_len, |
27 CanonOutputT<CHAR>* buffer, | 28 CanonOutputT<CHAR>* buffer, |
28 int* output_len) { | 29 int* output_len, |
| 30 bool* potentially_dangling_markup) { |
29 // Fast verification that there's nothing that needs removal. This is the 99% | 31 // Fast verification that there's nothing that needs removal. This is the 99% |
30 // case, so we want it to be fast and don't care about impacting the speed | 32 // case, so we want it to be fast and don't care about impacting the speed |
31 // when we do find whitespace. | 33 // when we do find whitespace. |
32 int found_whitespace = false; | 34 int found_whitespace = false; |
33 for (int i = 0; i < input_len; i++) { | 35 for (int i = 0; i < input_len; i++) { |
34 if (!IsRemovableURLWhitespace(input[i])) | 36 if (!IsRemovableURLWhitespace(input[i])) |
35 continue; | 37 continue; |
36 found_whitespace = true; | 38 found_whitespace = true; |
37 break; | 39 break; |
38 } | 40 } |
39 | 41 |
40 if (!found_whitespace) { | 42 if (!found_whitespace) { |
41 // Didn't find any whitespace, we don't need to do anything. We can just | 43 // Didn't find any whitespace, we don't need to do anything. We can just |
42 // return the input as the output. | 44 // return the input as the output. |
43 *output_len = input_len; | 45 *output_len = input_len; |
44 return input; | 46 return input; |
45 } | 47 } |
46 | 48 |
47 // Remove the whitespace into the new buffer and return it. | 49 // Remove the whitespace into the new buffer and return it. |
48 for (int i = 0; i < input_len; i++) { | 50 for (int i = 0; i < input_len; i++) { |
49 if (!IsRemovableURLWhitespace(input[i])) | 51 if (!IsRemovableURLWhitespace(input[i])) { |
| 52 if (potentially_dangling_markup && input[i] == 0x3C) |
| 53 *potentially_dangling_markup = true; |
50 buffer->push_back(input[i]); | 54 buffer->push_back(input[i]); |
| 55 } |
51 } | 56 } |
52 *output_len = buffer->length(); | 57 *output_len = buffer->length(); |
53 return buffer->data(); | 58 return buffer->data(); |
54 } | 59 } |
55 | 60 |
56 // Contains the canonical version of each possible input letter in the scheme | 61 // Contains the canonical version of each possible input letter in the scheme |
57 // (basically, lower-cased). The corresponding entry will be 0 if the letter | 62 // (basically, lower-cased). The corresponding entry will be 0 if the letter |
58 // is not allowed in a scheme. | 63 // is not allowed in a scheme. |
59 const char kSchemeCanonical[0x80] = { | 64 const char kSchemeCanonical[0x80] = { |
60 // 00-1f: all are invalid | 65 // 00-1f: all are invalid |
(...skipping 206 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
267 ReadUTFChar(spec, &i, end, &code_point); | 272 ReadUTFChar(spec, &i, end, &code_point); |
268 AppendUTF8Value(code_point, output); | 273 AppendUTF8Value(code_point, output); |
269 } | 274 } |
270 } | 275 } |
271 | 276 |
272 out_ref->len = output->length() - out_ref->begin; | 277 out_ref->len = output->length() - out_ref->begin; |
273 } | 278 } |
274 | 279 |
275 } // namespace | 280 } // namespace |
276 | 281 |
277 const char* RemoveURLWhitespace(const char* input, int input_len, | 282 const char* RemoveURLWhitespace(const char* input, |
| 283 int input_len, |
278 CanonOutputT<char>* buffer, | 284 CanonOutputT<char>* buffer, |
279 int* output_len) { | 285 int* output_len, |
280 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); | 286 bool* potentially_dangling_markup) { |
| 287 return DoRemoveURLWhitespace(input, input_len, buffer, output_len, |
| 288 potentially_dangling_markup); |
281 } | 289 } |
282 | 290 |
283 const base::char16* RemoveURLWhitespace(const base::char16* input, | 291 const base::char16* RemoveURLWhitespace(const base::char16* input, |
284 int input_len, | 292 int input_len, |
285 CanonOutputT<base::char16>* buffer, | 293 CanonOutputT<base::char16>* buffer, |
286 int* output_len) { | 294 int* output_len, |
287 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); | 295 bool* potentially_dangling_markup) { |
| 296 return DoRemoveURLWhitespace(input, input_len, buffer, output_len, |
| 297 potentially_dangling_markup); |
288 } | 298 } |
289 | 299 |
290 char CanonicalSchemeChar(base::char16 ch) { | 300 char CanonicalSchemeChar(base::char16 ch) { |
291 if (ch >= 0x80) | 301 if (ch >= 0x80) |
292 return 0; // Non-ASCII is not supported by schemes. | 302 return 0; // Non-ASCII is not supported by schemes. |
293 return kSchemeCanonical[ch]; | 303 return kSchemeCanonical[ch]; |
294 } | 304 } |
295 | 305 |
296 bool CanonicalizeScheme(const char* spec, | 306 bool CanonicalizeScheme(const char* spec, |
297 const Component& scheme, | 307 const Component& scheme, |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
358 } | 368 } |
359 | 369 |
360 void CanonicalizeRef(const base::char16* spec, | 370 void CanonicalizeRef(const base::char16* spec, |
361 const Component& ref, | 371 const Component& ref, |
362 CanonOutput* output, | 372 CanonOutput* output, |
363 Component* out_ref) { | 373 Component* out_ref) { |
364 DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref); | 374 DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref); |
365 } | 375 } |
366 | 376 |
367 } // namespace url | 377 } // namespace url |
OLD | NEW |