Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(373)

Side by Side Diff: base/string_util.cc

Issue 661205: Make IsStringUTF8 reject (U+FDD0 .. U+FDEF) ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 10 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/string_util.h" 5 #include "base/string_util.h"
6 6
7 #include "build/build_config.h" 7 #include "build/build_config.h"
8 8
9 #include <ctype.h> 9 #include <ctype.h>
10 #include <errno.h> 10 #include <errno.h>
11 #include <math.h> 11 #include <math.h>
12 #include <stdarg.h> 12 #include <stdarg.h>
13 #include <stdio.h> 13 #include <stdio.h>
14 #include <stdlib.h> 14 #include <stdlib.h>
15 #include <string.h> 15 #include <string.h>
16 #include <time.h> 16 #include <time.h>
17 #include <wchar.h> 17 #include <wchar.h>
18 #include <wctype.h> 18 #include <wctype.h>
19 19
20 #include <algorithm> 20 #include <algorithm>
21 #include <vector> 21 #include <vector>
22 22
23 #include "base/basictypes.h" 23 #include "base/basictypes.h"
24 #include "base/logging.h" 24 #include "base/logging.h"
25 #include "base/singleton.h" 25 #include "base/singleton.h"
26 #include "base/third_party/dmg_fp/dmg_fp.h" 26 #include "base/third_party/dmg_fp/dmg_fp.h"
27 #include "base/utf_string_conversion_utils.h"
28 #include "base/third_party/icu/icu_utf.h"
27 29
28 namespace { 30 namespace {
29 31
30 // Force the singleton used by Empty[W]String[16] to be a unique type. This 32 // Force the singleton used by Empty[W]String[16] to be a unique type. This
31 // prevents other code that might accidentally use Singleton<string> from 33 // prevents other code that might accidentally use Singleton<string> from
32 // getting our internal one. 34 // getting our internal one.
33 struct EmptyStrings { 35 struct EmptyStrings {
34 EmptyStrings() {} 36 EmptyStrings() {}
35 const std::string s; 37 const std::string s;
36 const std::wstring ws; 38 const std::wstring ws;
(...skipping 632 matching lines...) Expand 10 before | Expand all | Expand 10 after
669 #if !defined(WCHAR_T_IS_UTF16) 671 #if !defined(WCHAR_T_IS_UTF16)
670 bool IsStringASCII(const string16& str) { 672 bool IsStringASCII(const string16& str) {
671 return DoIsStringASCII(str); 673 return DoIsStringASCII(str);
672 } 674 }
673 #endif 675 #endif
674 676
675 bool IsStringASCII(const base::StringPiece& str) { 677 bool IsStringASCII(const base::StringPiece& str) {
676 return DoIsStringASCII(str); 678 return DoIsStringASCII(str);
677 } 679 }
678 680
679 // Helper functions that determine whether the given character begins a 681 bool IsStringUTF8(const std::string& str) {
680 // UTF-8 sequence of bytes with the given length. A character satisfies 682 const char *src = str.data();
681 // "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte 683 int32 src_len = static_cast<int32>(str.length());
682 // character. 684 int32 char_index = 0;
683 static inline bool IsBegin2ByteUTF8(int c) {
684 return (c & 0xE0) == 0xC0;
685 }
686 static inline bool IsBegin3ByteUTF8(int c) {
687 return (c & 0xF0) == 0xE0;
688 }
689 static inline bool IsBegin4ByteUTF8(int c) {
690 return (c & 0xF8) == 0xF0;
691 }
692 static inline bool IsInUTF8Sequence(int c) {
693 return (c & 0xC0) == 0x80;
694 }
695 685
696 // This function was copied from Mozilla, with modifications. The original code 686 while (char_index < src_len) {
697 // was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for 687 int32 code_point;
698 // this function is: 688 CBU8_NEXT(src, char_index, src_len, code_point);
699 // This function subject to the Mozilla Public License Version 689 if (!base::IsValidCharacter(code_point))
700 // 1.1 (the "License"); you may not use this code except in compliance with 690 return false;
701 // the License. You may obtain a copy of the License at
702 // http://www.mozilla.org/MPL/
703 //
704 // Software distributed under the License is distributed on an "AS IS" basis,
705 // WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
706 // for the specific language governing rights and limitations under the
707 // License.
708 //
709 // The Original Code is mozilla.org code.
710 //
711 // The Initial Developer of the Original Code is
712 // Netscape Communications Corporation.
713 // Portions created by the Initial Developer are Copyright (C) 2000
714 // the Initial Developer. All Rights Reserved.
715 //
716 // Contributor(s):
717 // Scott Collins <scc@mozilla.org> (original author)
718 //
719 // This is a template so that it can be run on wide and 8-bit strings. We want
720 // to run it on wide strings when we have input that we think may have
721 // originally been UTF-8, but has been converted to wide characters because
722 // that's what we (and Windows) use internally.
723 template<typename CHAR>
724 static bool IsStringUTF8T(const CHAR* str, size_t length) {
725 bool overlong = false;
726 bool surrogate = false;
727 bool nonchar = false;
728
729 // overlong byte upper bound
730 typename ToUnsigned<CHAR>::Unsigned olupper = 0;
731
732 // surrogate byte lower bound
733 typename ToUnsigned<CHAR>::Unsigned slower = 0;
734
735 // incremented when inside a multi-byte char to indicate how many bytes
736 // are left in the sequence
737 int positions_left = 0;
738
739 for (uintptr_t i = 0; i < length; i++) {
740 // This whole function assume an unsigned value so force its conversion to
741 // an unsigned value.
742 typename ToUnsigned<CHAR>::Unsigned c = str[i];
743 if (c < 0x80)
744 continue; // ASCII
745
746 if (c <= 0xC1) {
747 // [80-BF] where not expected, [C0-C1] for overlong
748 return false;
749 } else if (IsBegin2ByteUTF8(c)) {
750 positions_left = 1;
751 } else if (IsBegin3ByteUTF8(c)) {
752 positions_left = 2;
753 if (c == 0xE0) {
754 // to exclude E0[80-9F][80-BF]
755 overlong = true;
756 olupper = 0x9F;
757 } else if (c == 0xED) {
758 // ED[A0-BF][80-BF]: surrogate codepoint
759 surrogate = true;
760 slower = 0xA0;
761 } else if (c == 0xEF) {
762 // EF BF [BE-BF] : non-character
763 // TODO(jungshik): EF B7 [90-AF] should be checked as well.
764 nonchar = true;
765 }
766 } else if (c <= 0xF4) {
767 positions_left = 3;
768 nonchar = true;
769 if (c == 0xF0) {
770 // to exclude F0[80-8F][80-BF]{2}
771 overlong = true;
772 olupper = 0x8F;
773 } else if (c == 0xF4) {
774 // to exclude F4[90-BF][80-BF]
775 // actually not surrogates but codepoints beyond 0x10FFFF
776 surrogate = true;
777 slower = 0x90;
778 }
779 } else {
780 return false;
781 }
782
783 // eat the rest of this multi-byte character
784 while (positions_left) {
785 positions_left--;
786 i++;
787 c = str[i];
788 if (!c)
789 return false; // end of string but not end of character sequence
790
791 // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
792 if (nonchar && ((!positions_left && c < 0xBE) ||
793 (positions_left == 1 && c != 0xBF) ||
794 (positions_left == 2 && 0x0F != (0x0F & c) ))) {
795 nonchar = false;
796 }
797 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||
798 (surrogate && slower <= c) || (nonchar && !positions_left) ) {
799 return false;
800 }
801 overlong = surrogate = false;
802 }
803 } 691 }
804 return true; 692 return true;
805 } 693 }
806 694
807 bool IsStringUTF8(const std::string& str) {
808 return IsStringUTF8T(str.data(), str.length());
809 }
810
811 bool IsStringWideUTF8(const std::wstring& str) {
812 return IsStringUTF8T(str.data(), str.length());
813 }
814
815 template<typename Iter> 695 template<typename Iter>
816 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, 696 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
817 Iter a_end, 697 Iter a_end,
818 const char* b) { 698 const char* b) {
819 for (Iter it = a_begin; it != a_end; ++it, ++b) { 699 for (Iter it = a_begin; it != a_end; ++it, ++b) {
820 if (!*b || ToLowerASCII(*it) != *b) 700 if (!*b || ToLowerASCII(*it) != *b)
821 return false; 701 return false;
822 } 702 }
823 return *b == 0; 703 return *b == 0;
824 } 704 }
(...skipping 1135 matching lines...) Expand 10 before | Expand all | Expand 10 after
1960 // Each input byte creates two output hex characters. 1840 // Each input byte creates two output hex characters.
1961 std::string ret(size * 2, '\0'); 1841 std::string ret(size * 2, '\0');
1962 1842
1963 for (size_t i = 0; i < size; ++i) { 1843 for (size_t i = 0; i < size; ++i) {
1964 char b = reinterpret_cast<const char*>(bytes)[i]; 1844 char b = reinterpret_cast<const char*>(bytes)[i];
1965 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf]; 1845 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf];
1966 ret[(i * 2) + 1] = kHexChars[b & 0xf]; 1846 ret[(i * 2) + 1] = kHexChars[b & 0xf];
1967 } 1847 }
1968 return ret; 1848 return ret;
1969 } 1849 }
OLDNEW
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698