OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/string_util.h" | 5 #include "base/string_util.h" |
6 | 6 |
7 #include "build/build_config.h" | 7 #include "build/build_config.h" |
8 | 8 |
9 #include <ctype.h> | 9 #include <ctype.h> |
10 #include <errno.h> | 10 #include <errno.h> |
11 #include <math.h> | 11 #include <math.h> |
12 #include <stdarg.h> | 12 #include <stdarg.h> |
13 #include <stdio.h> | 13 #include <stdio.h> |
14 #include <stdlib.h> | 14 #include <stdlib.h> |
15 #include <string.h> | 15 #include <string.h> |
16 #include <time.h> | 16 #include <time.h> |
17 #include <wchar.h> | 17 #include <wchar.h> |
18 #include <wctype.h> | 18 #include <wctype.h> |
19 | 19 |
20 #include <algorithm> | 20 #include <algorithm> |
21 #include <vector> | 21 #include <vector> |
22 | 22 |
23 #include "base/basictypes.h" | 23 #include "base/basictypes.h" |
24 #include "base/logging.h" | 24 #include "base/logging.h" |
25 #include "base/singleton.h" | 25 #include "base/singleton.h" |
26 #include "base/third_party/dmg_fp/dmg_fp.h" | 26 #include "base/third_party/dmg_fp/dmg_fp.h" |
27 #include "base/utf_string_conversion_utils.h" | |
28 #include "base/third_party/icu/icu_utf.h" | |
29 | 27 |
30 namespace { | 28 namespace { |
31 | 29 |
32 // Force the singleton used by Empty[W]String[16] to be a unique type. This | 30 // Force the singleton used by Empty[W]String[16] to be a unique type. This |
33 // prevents other code that might accidentally use Singleton<string> from | 31 // prevents other code that might accidentally use Singleton<string> from |
34 // getting our internal one. | 32 // getting our internal one. |
35 struct EmptyStrings { | 33 struct EmptyStrings { |
36 EmptyStrings() {} | 34 EmptyStrings() {} |
37 const std::string s; | 35 const std::string s; |
38 const std::wstring ws; | 36 const std::wstring ws; |
(...skipping 567 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
606 #if !defined(WCHAR_T_IS_UTF16) | 604 #if !defined(WCHAR_T_IS_UTF16) |
607 bool IsStringASCII(const string16& str) { | 605 bool IsStringASCII(const string16& str) { |
608 return DoIsStringASCII(str); | 606 return DoIsStringASCII(str); |
609 } | 607 } |
610 #endif | 608 #endif |
611 | 609 |
612 bool IsStringASCII(const base::StringPiece& str) { | 610 bool IsStringASCII(const base::StringPiece& str) { |
613 return DoIsStringASCII(str); | 611 return DoIsStringASCII(str); |
614 } | 612 } |
615 | 613 |
| 614 // Helper functions that determine whether the given character begins a |
| 615 // UTF-8 sequence of bytes with the given length. A character satisfies |
| 616 // "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte |
| 617 // character. |
| 618 static inline bool IsBegin2ByteUTF8(int c) { |
| 619 return (c & 0xE0) == 0xC0; |
| 620 } |
| 621 static inline bool IsBegin3ByteUTF8(int c) { |
| 622 return (c & 0xF0) == 0xE0; |
| 623 } |
| 624 static inline bool IsBegin4ByteUTF8(int c) { |
| 625 return (c & 0xF8) == 0xF0; |
| 626 } |
| 627 static inline bool IsInUTF8Sequence(int c) { |
| 628 return (c & 0xC0) == 0x80; |
| 629 } |
| 630 |
| 631 // This function was copied from Mozilla, with modifications. The original code |
| 632 // was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for |
| 633 // this function is: |
| 634 // This function subject to the Mozilla Public License Version |
| 635 // 1.1 (the "License"); you may not use this code except in compliance with |
| 636 // the License. You may obtain a copy of the License at |
| 637 // http://www.mozilla.org/MPL/ |
| 638 // |
| 639 // Software distributed under the License is distributed on an "AS IS" basis, |
| 640 // WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
| 641 // for the specific language governing rights and limitations under the |
| 642 // License. |
| 643 // |
| 644 // The Original Code is mozilla.org code. |
| 645 // |
| 646 // The Initial Developer of the Original Code is |
| 647 // Netscape Communications Corporation. |
| 648 // Portions created by the Initial Developer are Copyright (C) 2000 |
| 649 // the Initial Developer. All Rights Reserved. |
| 650 // |
| 651 // Contributor(s): |
| 652 // Scott Collins <scc@mozilla.org> (original author) |
| 653 // |
| 654 // This is a template so that it can be run on wide and 8-bit strings. We want |
| 655 // to run it on wide strings when we have input that we think may have |
| 656 // originally been UTF-8, but has been converted to wide characters because |
| 657 // that's what we (and Windows) use internally. |
| 658 template<typename CHAR> |
| 659 static bool IsStringUTF8T(const CHAR* str, size_t length) { |
| 660 bool overlong = false; |
| 661 bool surrogate = false; |
| 662 bool nonchar = false; |
| 663 |
| 664 // overlong byte upper bound |
| 665 typename ToUnsigned<CHAR>::Unsigned olupper = 0; |
| 666 |
| 667 // surrogate byte lower bound |
| 668 typename ToUnsigned<CHAR>::Unsigned slower = 0; |
| 669 |
| 670 // set when a multi-byte char starts and decremented as its trailing bytes |
| 671 // are consumed; indicates how many bytes are left in the sequence |
| 672 int positions_left = 0; |
| 673 |
| 674 for (uintptr_t i = 0; i < length; i++) { |
| 675 // This whole function assumes an unsigned value so force its conversion to |
| 676 // an unsigned value. |
| 677 typename ToUnsigned<CHAR>::Unsigned c = str[i]; |
| 678 if (c < 0x80) |
| 679 continue; // ASCII |
| 680 |
| 681 if (c <= 0xC1) { |
| 682 // [80-BF] where not expected, [C0-C1] for overlong |
| 683 return false; |
| 684 } else if (IsBegin2ByteUTF8(c)) { |
| 685 positions_left = 1; |
| 686 } else if (IsBegin3ByteUTF8(c)) { |
| 687 positions_left = 2; |
| 688 if (c == 0xE0) { |
| 689 // to exclude E0[80-9F][80-BF] |
| 690 overlong = true; |
| 691 olupper = 0x9F; |
| 692 } else if (c == 0xED) { |
| 693 // ED[A0-BF][80-BF]: surrogate codepoint |
| 694 surrogate = true; |
| 695 slower = 0xA0; |
| 696 } else if (c == 0xEF) { |
| 697 // EF BF [BE-BF] : non-character |
| 698 // TODO(jungshik): EF B7 [90-AF] should be checked as well. |
| 699 nonchar = true; |
| 700 } |
| 701 } else if (c <= 0xF4) { |
| 702 positions_left = 3; |
| 703 nonchar = true; |
| 704 if (c == 0xF0) { |
| 705 // to exclude F0[80-8F][80-BF]{2} |
| 706 overlong = true; |
| 707 olupper = 0x8F; |
| 708 } else if (c == 0xF4) { |
| 709 // to exclude F4[90-BF][80-BF]{2} |
| 710 // actually not surrogates but codepoints beyond 0x10FFFF |
| 711 surrogate = true; |
| 712 slower = 0x90; |
| 713 } |
| 714 } else { |
| 715 return false; |
| 716 } |
| 717 |
| 718 // eat the rest of this multi-byte character |
| 719 while (positions_left) { |
| 720 positions_left--; |
| 721 i++; |
| 722 c = str[i]; |
| 723 if (!c) |
| 724 return false; // end of string but not end of character sequence |
| 725 |
| 726 // non-character : EF BF [BE-BF] or F[0-4] [89AB]F BF [BE-BF] |
| 727 if (nonchar && ((!positions_left && c < 0xBE) || |
| 728 (positions_left == 1 && c != 0xBF) || |
| 729 (positions_left == 2 && 0x0F != (0x0F & c) ))) { |
| 730 nonchar = false; |
| 731 } |
| 732 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || |
| 733 (surrogate && slower <= c) || (nonchar && !positions_left) ) { |
| 734 return false; |
| 735 } |
| 736 overlong = surrogate = false; |
| 737 } |
| 738 } |
| 739 return true; |
| 740 } |
| 741 |
616 bool IsStringUTF8(const std::string& str) { | 742 bool IsStringUTF8(const std::string& str) { |
617 const char *src = str.data(); | 743 return IsStringUTF8T(str.data(), str.length()); |
618 int32 src_len = static_cast<int32>(str.length()); | 744 } |
619 int32 char_index = 0; | |
620 | 745 |
621 while (char_index < src_len) { | 746 bool IsStringWideUTF8(const std::wstring& str) { |
622 int32 code_point; | 747 return IsStringUTF8T(str.data(), str.length()); |
623 CBU8_NEXT(src, char_index, src_len, code_point); | |
624 if (!base::IsValidCodepoint(code_point)) | |
625 return false; | |
626 } | |
627 | |
628 return true; | |
629 } | 748 } |
630 | 749 |
631 template<typename Iter> | 750 template<typename Iter> |
632 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, | 751 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, |
633 Iter a_end, | 752 Iter a_end, |
634 const char* b) { | 753 const char* b) { |
635 for (Iter it = a_begin; it != a_end; ++it, ++b) { | 754 for (Iter it = a_begin; it != a_end; ++it, ++b) { |
636 if (!*b || ToLowerASCII(*it) != *b) | 755 if (!*b || ToLowerASCII(*it) != *b) |
637 return false; | 756 return false; |
638 } | 757 } |
(...skipping 1119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1758 // Each input byte creates two output hex characters. | 1877 // Each input byte creates two output hex characters. |
1759 std::string ret(size * 2, '\0'); | 1878 std::string ret(size * 2, '\0'); |
1760 | 1879 |
1761 for (size_t i = 0; i < size; ++i) { | 1880 for (size_t i = 0; i < size; ++i) { |
1762 char b = reinterpret_cast<const char*>(bytes)[i]; | 1881 char b = reinterpret_cast<const char*>(bytes)[i]; |
1763 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf]; | 1882 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf]; |
1764 ret[(i * 2) + 1] = kHexChars[b & 0xf]; | 1883 ret[(i * 2) + 1] = kHexChars[b & 0xf]; |
1765 } | 1884 } |
1766 return ret; | 1885 return ret; |
1767 } | 1886 } |
OLD | NEW |