base/string_util.cc - Issue 552026: Revert 36459 - Breaks 7 WebKit tests...

Side by Side Diff: base/string_util.cc

Issue 552026: Revert 36459 - Breaks 7 WebKit tests... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 10 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/string_util.h"	5 #include "base/string_util.h"

6	6

7 #include "build/build_config.h"	7 #include "build/build_config.h"

8	8

9 #include <ctype.h>	9 #include <ctype.h>

10 #include <errno.h>	10 #include <errno.h>

11 #include <math.h>	11 #include <math.h>

12 #include <stdarg.h>	12 #include <stdarg.h>

13 #include <stdio.h>	13 #include <stdio.h>

14 #include <stdlib.h>	14 #include <stdlib.h>

15 #include <string.h>	15 #include <string.h>

16 #include <time.h>	16 #include <time.h>

17 #include <wchar.h>	17 #include <wchar.h>

18 #include <wctype.h>	18 #include <wctype.h>

19	19

20 #include <algorithm>	20 #include <algorithm>

21 #include <vector>	21 #include <vector>

22	22

23 #include "base/basictypes.h"	23 #include "base/basictypes.h"

24 #include "base/logging.h"	24 #include "base/logging.h"

25 #include "base/singleton.h"	25 #include "base/singleton.h"

26 #include "base/third_party/dmg_fp/dmg_fp.h"	26 #include "base/third_party/dmg_fp/dmg_fp.h"

27 #include "base/utf_string_conversion_utils.h"

28 #include "base/third_party/icu/icu_utf.h"

29	27

30 namespace {	28 namespace {

31	29

32 // Force the singleton used by Empty[W]String[16] to be a unique type. This	30 // Force the singleton used by Empty[W]String[16] to be a unique type. This

33 // prevents other code that might accidentally use Singleton<string> from	31 // prevents other code that might accidentally use Singleton<string> from

34 // getting our internal one.	32 // getting our internal one.

35 struct EmptyStrings {	33 struct EmptyStrings {

36 EmptyStrings() {}	34 EmptyStrings() {}

37 const std::string s;	35 const std::string s;

38 const std::wstring ws;	36 const std::wstring ws;

(...skipping 567 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
606 #if !defined(WCHAR_T_IS_UTF16)	604 #if !defined(WCHAR_T_IS_UTF16)

607 bool IsStringASCII(const string16& str) {	605 bool IsStringASCII(const string16& str) {

608 return DoIsStringASCII(str);	606 return DoIsStringASCII(str);

609 }	607 }

610 #endif	608 #endif

611	609

612 bool IsStringASCII(const base::StringPiece& str) {	610 bool IsStringASCII(const base::StringPiece& str) {

613 return DoIsStringASCII(str);	611 return DoIsStringASCII(str);

614 }	612 }

615	613

	614 // Helper functions that determine whether the given character begins a

	615 // UTF-8 sequence of bytes with the given length. A character satisfies

	616 // "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte

	617 // character.

	618 static inline bool IsBegin2ByteUTF8(int c) {

	619 return (c & 0xE0) == 0xC0;

	620 }

	621 static inline bool IsBegin3ByteUTF8(int c) {

	622 return (c & 0xF0) == 0xE0;

	623 }

	624 static inline bool IsBegin4ByteUTF8(int c) {

	625 return (c & 0xF8) == 0xF0;

	626 }

	627 static inline bool IsInUTF8Sequence(int c) {

	628 return (c & 0xC0) == 0x80;

	629 }

	630

	631 // This function was copied from Mozilla, with modifications. The original code

	632 // was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for

	633 // this function is:

	634 // This function subject to the Mozilla Public License Version

	635 // 1.1 (the "License"); you may not use this code except in compliance with

	636 // the License. You may obtain a copy of the License at

	637 // http://www.mozilla.org/MPL/

	638 //

	639 // Software distributed under the License is distributed on an "AS IS" basis,

	640 // WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

	641 // for the specific language governing rights and limitations under the

	642 // License.

	643 //

	644 // The Original Code is mozilla.org code.

	645 //

	646 // The Initial Developer of the Original Code is

	647 // Netscape Communications Corporation.

	648 // Portions created by the Initial Developer are Copyright (C) 2000

	649 // the Initial Developer. All Rights Reserved.

	650 //

	651 // Contributor(s):

	652 // Scott Collins <scc@mozilla.org> (original author)

	653 //

	654 // This is a template so that it can be run on wide and 8-bit strings. We want

	655 // to run it on wide strings when we have input that we think may have

	656 // originally been UTF-8, but has been converted to wide characters because

	657 // that's what we (and Windows) use internally.

	658 template<typename CHAR>

	659 static bool IsStringUTF8T(const CHAR* str, size_t length) {

	660 bool overlong = false;

	661 bool surrogate = false;

	662 bool nonchar = false;

	663

	664 // overlong byte upper bound

	665 typename ToUnsigned<CHAR>::Unsigned olupper = 0;

	666

	667 // surrogate byte lower bound

	668 typename ToUnsigned<CHAR>::Unsigned slower = 0;

	669

	670 // set when a lead byte is seen and counted down to indicate how many bytes

	671 // are left in the sequence

	672 int positions_left = 0;

	673

	674 for (uintptr_t i = 0; i < length; i++) {

	675 // This whole function assumes an unsigned value, so force its conversion to

	676 // an unsigned value.

	677 typename ToUnsigned<CHAR>::Unsigned c = str[i];

	678 if (c < 0x80)

	679 continue; // ASCII

	680

	681 if (c <= 0xC1) {

	682 // [80-BF] where not expected, [C0-C1] for overlong

	683 return false;

	684 } else if (IsBegin2ByteUTF8(c)) {

	685 positions_left = 1;

	686 } else if (IsBegin3ByteUTF8(c)) {

	687 positions_left = 2;

	688 if (c == 0xE0) {

	689 // to exclude E0[80-9F][80-BF]

	690 overlong = true;

	691 olupper = 0x9F;

	692 } else if (c == 0xED) {

	693 // ED[A0-BF][80-BF]: surrogate codepoint

	694 surrogate = true;

	695 slower = 0xA0;

	696 } else if (c == 0xEF) {

	697 // EF BF [BE-BF] : non-character

	698 // TODO(jungshik): EF B7 [90-AF] should be checked as well.

	699 nonchar = true;

	700 }

	701 } else if (c <= 0xF4) {

	702 positions_left = 3;

	703 nonchar = true;

	704 if (c == 0xF0) {

	705 // to exclude F0[80-8F][80-BF]{2}

	706 overlong = true;

	707 olupper = 0x8F;

	708 } else if (c == 0xF4) {

	709 // to exclude F4[90-BF][80-BF]{2}

	710 // actually not surrogates but codepoints beyond 0x10FFFF

	711 surrogate = true;

	712 slower = 0x90;

	713 }

	714 } else {

	715 return false;

	716 }

	717

	718 // eat the rest of this multi-byte character

	719 while (positions_left) {

	720 positions_left--;

	721 i++;

	722 c = str[i];

	723 if (!c)

	724 return false; // end of string but not end of character sequence

	725

	726 // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]

	727 if (nonchar && ((!positions_left && c < 0xBE) \|\|

	728 (positions_left == 1 && c != 0xBF) \|\|

	729 (positions_left == 2 && 0x0F != (0x0F & c) ))) {

	730 nonchar = false;

	731 }

	732 if (!IsInUTF8Sequence(c) \|\| (overlong && c <= olupper) \|\|

	733 (surrogate && slower <= c) \|\| (nonchar && !positions_left) ) {

	734 return false;

	735 }

	736 overlong = surrogate = false;

	737 }

	738 }

	739 return true;

	740 }

	741

616 bool IsStringUTF8(const std::string& str) {	742 bool IsStringUTF8(const std::string& str) {

617 const char *src = str.data();	743 return IsStringUTF8T(str.data(), str.length());

618 int32 src_len = static_cast<int32>(str.length());	744 }

619 int32 char_index = 0;

620	745

621 while (char_index < src_len) {	746 bool IsStringWideUTF8(const std::wstring& str) {

622 int32 code_point;	747 return IsStringUTF8T(str.data(), str.length());

623 CBU8_NEXT(src, char_index, src_len, code_point);

624 if (!base::IsValidCodepoint(code_point))

625 return false;

626 }

627

628 return true;

629 }	748 }

630	749

631 template<typename Iter>	750 template<typename Iter>

632 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,	751 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,

633 Iter a_end,	752 Iter a_end,

634 const char* b) {	753 const char* b) {

635 for (Iter it = a_begin; it != a_end; ++it, ++b) {	754 for (Iter it = a_begin; it != a_end; ++it, ++b) {

636 if (!b \|\| ToLowerASCII(it) != *b)	755 if (!b \|\| ToLowerASCII(it) != *b)

637 return false;	756 return false;

638 }	757 }

(...skipping 1119 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1758 // Each input byte creates two output hex characters.	1877 // Each input byte creates two output hex characters.

1759 std::string ret(size * 2, '\0');	1878 std::string ret(size * 2, '\0');

1760	1879

1761 for (size_t i = 0; i < size; ++i) {	1880 for (size_t i = 0; i < size; ++i) {

1762 char b = reinterpret_cast<const char*>(bytes)[i];	1881 char b = reinterpret_cast<const char*>(bytes)[i];

1763 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf];	1882 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf];

1764 ret[(i * 2) + 1] = kHexChars[b & 0xf];	1883 ret[(i * 2) + 1] = kHexChars[b & 0xf];

1765 }	1884 }

1766 return ret;	1885 return ret;

1767 }	1886 }

OLD	NEW

« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »