OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/string_util.h" | 5 #include "base/string_util.h" |
6 | 6 |
7 #include "build/build_config.h" | 7 #include "build/build_config.h" |
8 | 8 |
9 #include <ctype.h> | 9 #include <ctype.h> |
10 #include <errno.h> | 10 #include <errno.h> |
11 #include <math.h> | 11 #include <math.h> |
12 #include <stdarg.h> | 12 #include <stdarg.h> |
13 #include <stdio.h> | 13 #include <stdio.h> |
14 #include <stdlib.h> | 14 #include <stdlib.h> |
15 #include <string.h> | 15 #include <string.h> |
16 #include <time.h> | 16 #include <time.h> |
17 #include <wchar.h> | 17 #include <wchar.h> |
18 #include <wctype.h> | 18 #include <wctype.h> |
19 | 19 |
20 #include <algorithm> | 20 #include <algorithm> |
21 #include <vector> | 21 #include <vector> |
22 | 22 |
23 #include "base/basictypes.h" | 23 #include "base/basictypes.h" |
24 #include "base/logging.h" | 24 #include "base/logging.h" |
25 #include "base/singleton.h" | 25 #include "base/singleton.h" |
26 #include "base/third_party/dmg_fp/dmg_fp.h" | 26 #include "base/third_party/dmg_fp/dmg_fp.h" |
| 27 #include "base/utf_string_conversion_utils.h" |
| 28 #include "base/third_party/icu/icu_utf.h" |
27 | 29 |
28 namespace { | 30 namespace { |
29 | 31 |
30 // Force the singleton used by Empty[W]String[16] to be a unique type. This | 32 // Force the singleton used by Empty[W]String[16] to be a unique type. This |
31 // prevents other code that might accidentally use Singleton<string> from | 33 // prevents other code that might accidentally use Singleton<string> from |
32 // getting our internal one. | 34 // getting our internal one. |
33 struct EmptyStrings { | 35 struct EmptyStrings { |
34 EmptyStrings() {} | 36 EmptyStrings() {} |
35 const std::string s; | 37 const std::string s; |
36 const std::wstring ws; | 38 const std::wstring ws; |
(...skipping 632 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
669 #if !defined(WCHAR_T_IS_UTF16) | 671 #if !defined(WCHAR_T_IS_UTF16) |
670 bool IsStringASCII(const string16& str) { | 672 bool IsStringASCII(const string16& str) { |
671 return DoIsStringASCII(str); | 673 return DoIsStringASCII(str); |
672 } | 674 } |
673 #endif | 675 #endif |
674 | 676 |
675 bool IsStringASCII(const base::StringPiece& str) { | 677 bool IsStringASCII(const base::StringPiece& str) { |
676 return DoIsStringASCII(str); | 678 return DoIsStringASCII(str); |
677 } | 679 } |
678 | 680 |
679 // Helper functions that determine whether the given character begins a | 681 bool IsStringUTF8(const std::string& str) { |
680 // UTF-8 sequence of bytes with the given length. A character satisfies | 682 const char *src = str.data(); |
681 // "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte | 683 int32 src_len = static_cast<int32>(str.length()); |
682 // character. | 684 int32 char_index = 0; |
683 static inline bool IsBegin2ByteUTF8(int c) { | |
684 return (c & 0xE0) == 0xC0; | |
685 } | |
686 static inline bool IsBegin3ByteUTF8(int c) { | |
687 return (c & 0xF0) == 0xE0; | |
688 } | |
689 static inline bool IsBegin4ByteUTF8(int c) { | |
690 return (c & 0xF8) == 0xF0; | |
691 } | |
692 static inline bool IsInUTF8Sequence(int c) { | |
693 return (c & 0xC0) == 0x80; | |
694 } | |
695 | 685 |
696 // This function was copied from Mozilla, with modifications. The original code | 686 while (char_index < src_len) { |
697 // was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for | 687 int32 code_point; |
698 // this function is: | 688 CBU8_NEXT(src, char_index, src_len, code_point); |
699 // This function subject to the Mozilla Public License Version | 689 if (!base::IsValidCharacter(code_point)) |
700 // 1.1 (the "License"); you may not use this code except in compliance with | 690 return false; |
701 // the License. You may obtain a copy of the License at | |
702 // http://www.mozilla.org/MPL/ | |
703 // | |
704 // Software distributed under the License is distributed on an "AS IS" basis, | |
705 // WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | |
706 // for the specific language governing rights and limitations under the | |
707 // License. | |
708 // | |
709 // The Original Code is mozilla.org code. | |
710 // | |
711 // The Initial Developer of the Original Code is | |
712 // Netscape Communications Corporation. | |
713 // Portions created by the Initial Developer are Copyright (C) 2000 | |
714 // the Initial Developer. All Rights Reserved. | |
715 // | |
716 // Contributor(s): | |
717 // Scott Collins <scc@mozilla.org> (original author) | |
718 // | |
719 // This is a template so that it can be run on wide and 8-bit strings. We want | |
720 // to run it on wide strings when we have input that we think may have | |
721 // originally been UTF-8, but has been converted to wide characters because | |
722 // that's what we (and Windows) use internally. | |
723 template<typename CHAR> | |
724 static bool IsStringUTF8T(const CHAR* str, size_t length) { | |
725 bool overlong = false; | |
726 bool surrogate = false; | |
727 bool nonchar = false; | |
728 | |
729 // overlong byte upper bound | |
730 typename ToUnsigned<CHAR>::Unsigned olupper = 0; | |
731 | |
732 // surrogate byte lower bound | |
733 typename ToUnsigned<CHAR>::Unsigned slower = 0; | |
734 | |
735 // incremented when inside a multi-byte char to indicate how many bytes | |
736 // are left in the sequence | |
737 int positions_left = 0; | |
738 | |
739 for (uintptr_t i = 0; i < length; i++) { | |
740 // This whole function assume an unsigned value so force its conversion to | |
741 // an unsigned value. | |
742 typename ToUnsigned<CHAR>::Unsigned c = str[i]; | |
743 if (c < 0x80) | |
744 continue; // ASCII | |
745 | |
746 if (c <= 0xC1) { | |
747 // [80-BF] where not expected, [C0-C1] for overlong | |
748 return false; | |
749 } else if (IsBegin2ByteUTF8(c)) { | |
750 positions_left = 1; | |
751 } else if (IsBegin3ByteUTF8(c)) { | |
752 positions_left = 2; | |
753 if (c == 0xE0) { | |
754 // to exclude E0[80-9F][80-BF] | |
755 overlong = true; | |
756 olupper = 0x9F; | |
757 } else if (c == 0xED) { | |
758 // ED[A0-BF][80-BF]: surrogate codepoint | |
759 surrogate = true; | |
760 slower = 0xA0; | |
761 } else if (c == 0xEF) { | |
762 // EF BF [BE-BF] : non-character | |
763 // TODO(jungshik): EF B7 [90-AF] should be checked as well. | |
764 nonchar = true; | |
765 } | |
766 } else if (c <= 0xF4) { | |
767 positions_left = 3; | |
768 nonchar = true; | |
769 if (c == 0xF0) { | |
770 // to exclude F0[80-8F][80-BF]{2} | |
771 overlong = true; | |
772 olupper = 0x8F; | |
773 } else if (c == 0xF4) { | |
774 // to exclude F4[90-BF][80-BF] | |
775 // actually not surrogates but codepoints beyond 0x10FFFF | |
776 surrogate = true; | |
777 slower = 0x90; | |
778 } | |
779 } else { | |
780 return false; | |
781 } | |
782 | |
783 // eat the rest of this multi-byte character | |
784 while (positions_left) { | |
785 positions_left--; | |
786 i++; | |
787 c = str[i]; | |
788 if (!c) | |
789 return false; // end of string but not end of character sequence | |
790 | |
791 // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF] | |
792 if (nonchar && ((!positions_left && c < 0xBE) || | |
793 (positions_left == 1 && c != 0xBF) || | |
794 (positions_left == 2 && 0x0F != (0x0F & c) ))) { | |
795 nonchar = false; | |
796 } | |
797 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || | |
798 (surrogate && slower <= c) || (nonchar && !positions_left) ) { | |
799 return false; | |
800 } | |
801 overlong = surrogate = false; | |
802 } | |
803 } | 691 } |
804 return true; | 692 return true; |
805 } | 693 } |
806 | 694 |
807 bool IsStringUTF8(const std::string& str) { | |
808 return IsStringUTF8T(str.data(), str.length()); | |
809 } | |
810 | |
811 bool IsStringWideUTF8(const std::wstring& str) { | |
812 return IsStringUTF8T(str.data(), str.length()); | |
813 } | |
814 | |
815 template<typename Iter> | 695 template<typename Iter> |
816 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, | 696 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, |
817 Iter a_end, | 697 Iter a_end, |
818 const char* b) { | 698 const char* b) { |
819 for (Iter it = a_begin; it != a_end; ++it, ++b) { | 699 for (Iter it = a_begin; it != a_end; ++it, ++b) { |
820 if (!*b || ToLowerASCII(*it) != *b) | 700 if (!*b || ToLowerASCII(*it) != *b) |
821 return false; | 701 return false; |
822 } | 702 } |
823 return *b == 0; | 703 return *b == 0; |
824 } | 704 } |
(...skipping 1135 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1960 // Each input byte creates two output hex characters. | 1840 // Each input byte creates two output hex characters. |
1961 std::string ret(size * 2, '\0'); | 1841 std::string ret(size * 2, '\0'); |
1962 | 1842 |
1963 for (size_t i = 0; i < size; ++i) { | 1843 for (size_t i = 0; i < size; ++i) { |
1964 char b = reinterpret_cast<const char*>(bytes)[i]; | 1844 char b = reinterpret_cast<const char*>(bytes)[i]; |
1965 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf]; | 1845 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf]; |
1966 ret[(i * 2) + 1] = kHexChars[b & 0xf]; | 1846 ret[(i * 2) + 1] = kHexChars[b & 0xf]; |
1967 } | 1847 } |
1968 return ret; | 1848 return ret; |
1969 } | 1849 } |
OLD | NEW |