base/string_util.cc - Issue 661205: Make IsStringUTF8 reject (U+FDD0 .. U+FDEF) ...

Side by Side Diff: base/string_util.cc

Issue 661205: Make IsStringUTF8 reject (U+FDD0 .. U+FDEF) ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 10 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/string_util.h"	5 #include "base/string_util.h"

6	6

7 #include "build/build_config.h"	7 #include "build/build_config.h"

8	8

9 #include <ctype.h>	9 #include <ctype.h>

10 #include <errno.h>	10 #include <errno.h>

11 #include <math.h>	11 #include <math.h>

12 #include <stdarg.h>	12 #include <stdarg.h>

13 #include <stdio.h>	13 #include <stdio.h>

14 #include <stdlib.h>	14 #include <stdlib.h>

15 #include <string.h>	15 #include <string.h>

16 #include <time.h>	16 #include <time.h>

17 #include <wchar.h>	17 #include <wchar.h>

18 #include <wctype.h>	18 #include <wctype.h>

19	19

20 #include <algorithm>	20 #include <algorithm>

21 #include <vector>	21 #include <vector>

22	22

23 #include "base/basictypes.h"	23 #include "base/basictypes.h"

24 #include "base/logging.h"	24 #include "base/logging.h"

25 #include "base/singleton.h"	25 #include "base/singleton.h"

26 #include "base/third_party/dmg_fp/dmg_fp.h"	26 #include "base/third_party/dmg_fp/dmg_fp.h"

	27 #include "base/utf_string_conversion_utils.h"

	28 #include "base/third_party/icu/icu_utf.h"

27	29

28 namespace {	30 namespace {

29	31

30 // Force the singleton used by Empty[W]String[16] to be a unique type. This	32 // Force the singleton used by Empty[W]String[16] to be a unique type. This

31 // prevents other code that might accidentally use Singleton<string> from	33 // prevents other code that might accidentally use Singleton<string> from

32 // getting our internal one.	34 // getting our internal one.

33 struct EmptyStrings {	35 struct EmptyStrings {

34 EmptyStrings() {}	36 EmptyStrings() {}

35 const std::string s;	37 const std::string s;

36 const std::wstring ws;	38 const std::wstring ws;

(...skipping 632 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
669 #if !defined(WCHAR_T_IS_UTF16)	671 #if !defined(WCHAR_T_IS_UTF16)

670 bool IsStringASCII(const string16& str) {	672 bool IsStringASCII(const string16& str) {

671 return DoIsStringASCII(str);	673 return DoIsStringASCII(str);

672 }	674 }

673 #endif	675 #endif

674	676

675 bool IsStringASCII(const base::StringPiece& str) {	677 bool IsStringASCII(const base::StringPiece& str) {

676 return DoIsStringASCII(str);	678 return DoIsStringASCII(str);

677 }	679 }

678	680

679 // Helper functions that determine whether the given character begins a	681 bool IsStringUTF8(const std::string& str) {

680 // UTF-8 sequence of bytes with the given length. A character satisfies	682 const char *src = str.data();

681 // "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte	683 int32 src_len = static_cast<int32>(str.length());

682 // character.	684 int32 char_index = 0;

683 static inline bool IsBegin2ByteUTF8(int c) {

684 return (c & 0xE0) == 0xC0;

685 }

686 static inline bool IsBegin3ByteUTF8(int c) {

687 return (c & 0xF0) == 0xE0;

688 }

689 static inline bool IsBegin4ByteUTF8(int c) {

690 return (c & 0xF8) == 0xF0;

691 }

692 static inline bool IsInUTF8Sequence(int c) {

693 return (c & 0xC0) == 0x80;

694 }

695	685

696 // This function was copied from Mozilla, with modifications. The original code	686 while (char_index < src_len) {

697 // was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for	687 int32 code_point;

698 // this function is:	688 CBU8_NEXT(src, char_index, src_len, code_point);

699 // This function subject to the Mozilla Public License Version	689 if (!base::IsValidCharacter(code_point))

700 // 1.1 (the "License"); you may not use this code except in compliance with	690 return false;

701 // the License. You may obtain a copy of the License at

702 // http://www.mozilla.org/MPL/

703 //

704 // Software distributed under the License is distributed on an "AS IS" basis,

705 // WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

706 // for the specific language governing rights and limitations under the

707 // License.

708 //

709 // The Original Code is mozilla.org code.

710 //

711 // The Initial Developer of the Original Code is

712 // Netscape Communications Corporation.

713 // Portions created by the Initial Developer are Copyright (C) 2000

714 // the Initial Developer. All Rights Reserved.

715 //

716 // Contributor(s):

717 // Scott Collins <scc@mozilla.org> (original author)

718 //

719 // This is a template so that it can be run on wide and 8-bit strings. We want

720 // to run it on wide strings when we have input that we think may have

721 // originally been UTF-8, but has been converted to wide characters because

722 // that's what we (and Windows) use internally.

723 template<typename CHAR>

724 static bool IsStringUTF8T(const CHAR* str, size_t length) {

725 bool overlong = false;

726 bool surrogate = false;

727 bool nonchar = false;

728

729 // overlong byte upper bound

730 typename ToUnsigned<CHAR>::Unsigned olupper = 0;

731

732 // surrogate byte lower bound

733 typename ToUnsigned<CHAR>::Unsigned slower = 0;

734

735 // incremented when inside a multi-byte char to indicate how many bytes

736 // are left in the sequence

737 int positions_left = 0;

738

739 for (uintptr_t i = 0; i < length; i++) {

740 // This whole function assume an unsigned value so force its conversion to

741 // an unsigned value.

742 typename ToUnsigned<CHAR>::Unsigned c = str[i];

743 if (c < 0x80)

744 continue; // ASCII

745

746 if (c <= 0xC1) {

747 // [80-BF] where not expected, [C0-C1] for overlong

748 return false;

749 } else if (IsBegin2ByteUTF8(c)) {

750 positions_left = 1;

751 } else if (IsBegin3ByteUTF8(c)) {

752 positions_left = 2;

753 if (c == 0xE0) {

754 // to exclude E0[80-9F][80-BF]

755 overlong = true;

756 olupper = 0x9F;

757 } else if (c == 0xED) {

758 // ED[A0-BF][80-BF]: surrogate codepoint

759 surrogate = true;

760 slower = 0xA0;

761 } else if (c == 0xEF) {

762 // EF BF [BE-BF] : non-character

763 // TODO(jungshik): EF B7 [90-AF] should be checked as well.

764 nonchar = true;

765 }

766 } else if (c <= 0xF4) {

767 positions_left = 3;

768 nonchar = true;

769 if (c == 0xF0) {

770 // to exclude F0[80-8F][80-BF]{2}

771 overlong = true;

772 olupper = 0x8F;

773 } else if (c == 0xF4) {

774 // to exclude F4[90-BF][80-BF]

775 // actually not surrogates but codepoints beyond 0x10FFFF

776 surrogate = true;

777 slower = 0x90;

778 }

779 } else {

780 return false;

781 }

782

783 // eat the rest of this multi-byte character

784 while (positions_left) {

785 positions_left--;

786 i++;

787 c = str[i];

788 if (!c)

789 return false; // end of string but not end of character sequence

790

791 // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]

792 if (nonchar && ((!positions_left && c < 0xBE) ||

793 (positions_left == 1 && c != 0xBF) ||

794 (positions_left == 2 && 0x0F != (0x0F & c) ))) {

795 nonchar = false;

796 }

797 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||

798 (surrogate && slower <= c) || (nonchar && !positions_left) ) {

799 return false;

800 }

801 overlong = surrogate = false;

802 }

803 }	691 }

804 return true;	692 return true;

805 }	693 }

806	694

807 bool IsStringUTF8(const std::string& str) {

808 return IsStringUTF8T(str.data(), str.length());

809 }

810

811 bool IsStringWideUTF8(const std::wstring& str) {

812 return IsStringUTF8T(str.data(), str.length());

813 }

814

815 template<typename Iter>	695 template<typename Iter>

816 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,	696 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,

817 Iter a_end,	697 Iter a_end,

818 const char* b) {	698 const char* b) {

819 for (Iter it = a_begin; it != a_end; ++it, ++b) {	699 for (Iter it = a_begin; it != a_end; ++it, ++b) {

820 if (!*b || ToLowerASCII(*it) != *b)	700 if (!*b || ToLowerASCII(*it) != *b)

821 return false;	701 return false;

822 }	702 }

823 return *b == 0;	703 return *b == 0;

824 }	704 }

(...skipping 1135 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1960 // Each input byte creates two output hex characters.	1840 // Each input byte creates two output hex characters.

1961 std::string ret(size * 2, '\0');	1841 std::string ret(size * 2, '\0');

1962	1842

1963 for (size_t i = 0; i < size; ++i) {	1843 for (size_t i = 0; i < size; ++i) {

1964 char b = reinterpret_cast<const char*>(bytes)[i];	1844 char b = reinterpret_cast<const char*>(bytes)[i];

1965 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf];	1845 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf];

1966 ret[(i * 2) + 1] = kHexChars[b & 0xf];	1846 ret[(i * 2) + 1] = kHexChars[b & 0xf];

1967 }	1847 }

1968 return ret;	1848 return ret;

1969 }	1849 }

OLD	NEW

« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »