OLD | NEW |
1 // Copyright 2007, Google Inc. | 1 // Copyright 2007, Google Inc. |
2 // All rights reserved. | 2 // All rights reserved. |
3 // | 3 // |
4 // Redistribution and use in source and binary forms, with or without | 4 // Redistribution and use in source and binary forms, with or without |
5 // modification, are permitted provided that the following conditions are | 5 // modification, are permitted provided that the following conditions are |
6 // met: | 6 // met: |
7 // | 7 // |
8 // * Redistributions of source code must retain the above copyright | 8 // * Redistributions of source code must retain the above copyright |
9 // notice, this list of conditions and the following disclaimer. | 9 // notice, this list of conditions and the following disclaimer. |
10 // * Redistributions in binary form must reproduce the above | 10 // * Redistributions in binary form must reproduce the above |
(...skipping 15 matching lines...) Expand all Loading... |
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | 29 |
30 #include <string.h> | 30 #include <string.h> |
31 #include <vector> | 31 #include <vector> |
32 | 32 |
33 #include "googleurl/src/url_util.h" | 33 #include "googleurl/src/url_util.h" |
34 | 34 |
35 #include "base/logging.h" | 35 #include "base/logging.h" |
| 36 #include "googleurl/src/url_canon_internal.h" |
36 #include "googleurl/src/url_file.h" | 37 #include "googleurl/src/url_file.h" |
37 | 38 |
38 namespace url_util { | 39 namespace url_util { |
39 | 40 |
40 namespace { | 41 namespace { |
41 | 42 |
42 // ASCII-specific tolower. The standard library's tolower is locale sensitive, | 43 // ASCII-specific tolower. The standard library's tolower is locale sensitive, |
43 // so we don't want to use it here. | 44 // so we don't want to use it here. |
44 template <class Char> inline Char ToLowerASCII(Char c) { | 45 template <class Char> inline Char ToLowerASCII(Char c) { |
45 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; | 46 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; |
(...skipping 447 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
493 } | 494 } |
494 return a_begin == a_end && b_begin == b_end; | 495 return a_begin == a_end && b_begin == b_end; |
495 } | 496 } |
496 | 497 |
497 bool LowerCaseEqualsASCII(const char16* a_begin, | 498 bool LowerCaseEqualsASCII(const char16* a_begin, |
498 const char16* a_end, | 499 const char16* a_end, |
499 const char* b) { | 500 const char* b) { |
500 return DoLowerCaseEqualsASCII(a_begin, a_end, b); | 501 return DoLowerCaseEqualsASCII(a_begin, a_end, b); |
501 } | 502 } |
502 | 503 |
| 504 void DecodeURLEscapeSequences(const char* input, int length, |
| 505 url_canon::CanonOutputW* output) { |
| 506 url_canon::RawCanonOutputT<char> unescaped_chars; |
| 507 for (int i = 0; i < length; i++) { |
| 508 if (input[i] == '%') { |
| 509 unsigned char ch; |
| 510 if (url_canon::DecodeEscaped(input, &i, length, &ch)) { |
| 511 unescaped_chars.push_back(ch); |
| 512 } else { |
| 513 // Invalid escape sequence, copy the percent literal. |
| 514 unescaped_chars.push_back('%'); |
| 515 } |
| 516 } else { |
| 517 // Regular non-escaped 8-bit character. |
| 518 unescaped_chars.push_back(input[i]); |
| 519 } |
| 520 } |
| 521 |
| 522 // Convert that 8-bit to UTF-16. It's not clear IE does this at all to |
| 523 // JavaScript URLs, but Firefox and Safari do. |
| 524 for (int i = 0; i < unescaped_chars.length(); i++) { |
| 525 unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i)); |
| 526 if (uch < 0x80) { |
| 527 // Non-UTF-8, just append directly |
| 528 output->push_back(uch); |
| 529 } else { |
| 530 // next_ch will point to the last character of the decoded |
| 531 // character. |
| 532 int next_character = i; |
| 533 unsigned code_point; |
| 534 if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character, |
| 535 unescaped_chars.length(), &code_point)) { |
| 536 // Valid UTF-8 character, convert to UTF-16. |
| 537 url_canon::AppendUTF16Value(code_point, output); |
| 538 i = next_character; |
| 539 } else { |
| 540 // If there are any sequences that are not valid UTF-8, we keep |
| 541 // invalid code points and promote to UTF-16. We copy all characters |
| 542 // from the current position to the end of the identified sequence. |
| 543 while (i < next_character) { |
| 544 output->push_back(static_cast<unsigned char>(unescaped_chars.at(i))); |
| 545 i++; |
| 546 } |
| 547 output->push_back(static_cast<unsigned char>(unescaped_chars.at(i))); |
| 548 } |
| 549 } |
| 550 } |
| 551 } |
| 552 |
503 } // namespace url_util | 553 } // namespace url_util |
OLD | NEW |