Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(90)

Side by Side Diff: base/string_util.cc

Issue 552026: Revert 36459 - Breaks 7 WebKit tests... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: Created 10 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/string_util.h" 5 #include "base/string_util.h"
6 6
7 #include "build/build_config.h" 7 #include "build/build_config.h"
8 8
9 #include <ctype.h> 9 #include <ctype.h>
10 #include <errno.h> 10 #include <errno.h>
11 #include <math.h> 11 #include <math.h>
12 #include <stdarg.h> 12 #include <stdarg.h>
13 #include <stdio.h> 13 #include <stdio.h>
14 #include <stdlib.h> 14 #include <stdlib.h>
15 #include <string.h> 15 #include <string.h>
16 #include <time.h> 16 #include <time.h>
17 #include <wchar.h> 17 #include <wchar.h>
18 #include <wctype.h> 18 #include <wctype.h>
19 19
20 #include <algorithm> 20 #include <algorithm>
21 #include <vector> 21 #include <vector>
22 22
23 #include "base/basictypes.h" 23 #include "base/basictypes.h"
24 #include "base/logging.h" 24 #include "base/logging.h"
25 #include "base/singleton.h" 25 #include "base/singleton.h"
26 #include "base/third_party/dmg_fp/dmg_fp.h" 26 #include "base/third_party/dmg_fp/dmg_fp.h"
27 #include "base/utf_string_conversion_utils.h"
28 #include "base/third_party/icu/icu_utf.h"
29 27
30 namespace { 28 namespace {
31 29
32 // Force the singleton used by Empty[W]String[16] to be a unique type. This 30 // Force the singleton used by Empty[W]String[16] to be a unique type. This
33 // prevents other code that might accidentally use Singleton<string> from 31 // prevents other code that might accidentally use Singleton<string> from
34 // getting our internal one. 32 // getting our internal one.
35 struct EmptyStrings { 33 struct EmptyStrings {
36 EmptyStrings() {} 34 EmptyStrings() {}
37 const std::string s; 35 const std::string s;
38 const std::wstring ws; 36 const std::wstring ws;
(...skipping 567 matching lines...) Expand 10 before | Expand all | Expand 10 after
606 #if !defined(WCHAR_T_IS_UTF16) 604 #if !defined(WCHAR_T_IS_UTF16)
607 bool IsStringASCII(const string16& str) { 605 bool IsStringASCII(const string16& str) {
608 return DoIsStringASCII(str); 606 return DoIsStringASCII(str);
609 } 607 }
610 #endif 608 #endif
611 609
612 bool IsStringASCII(const base::StringPiece& str) { 610 bool IsStringASCII(const base::StringPiece& str) {
613 return DoIsStringASCII(str); 611 return DoIsStringASCII(str);
614 } 612 }
615 613
614 // Helper functions that determine whether the given byte is the lead byte
615 // of a UTF-8 sequence with the given length. A byte satisfies
616 // "IsInUTF8Sequence" if it is a continuation byte (10xxxxxx), i.e. anything
617 // but the first byte of a multi-byte character.
// Returns true when the low eight bits of |c| match 110xxxxx, the bit pattern
// of the lead byte of a two-byte UTF-8 sequence.
static inline bool IsBegin2ByteUTF8(int c) {
  const int kLeadMask = 0xE0;  // top three bits
  const int kLeadBits = 0xC0;  // 110xxxxx
  return (c & kLeadMask) == kLeadBits;
}
// Returns true when the low eight bits of |c| match 1110xxxx, the bit pattern
// of the lead byte of a three-byte UTF-8 sequence.
static inline bool IsBegin3ByteUTF8(int c) {
  const int kLeadMask = 0xF0;  // top four bits
  const int kLeadBits = 0xE0;  // 1110xxxx
  return (c & kLeadMask) == kLeadBits;
}
// Returns true when the low eight bits of |c| match 11110xxx, the bit pattern
// of the lead byte of a four-byte UTF-8 sequence.
static inline bool IsBegin4ByteUTF8(int c) {
  const int kLeadMask = 0xF8;  // top five bits
  const int kLeadBits = 0xF0;  // 11110xxx
  return (c & kLeadMask) == kLeadBits;
}
// Returns true when the low eight bits of |c| match 10xxxxxx, the bit pattern
// of a UTF-8 continuation byte (any byte of a multi-byte character other than
// the first).
static inline bool IsInUTF8Sequence(int c) {
  const int kTrailMask = 0xC0;  // top two bits
  const int kTrailBits = 0x80;  // 10xxxxxx
  return (c & kTrailMask) == kTrailBits;
}
630
631 // This function was copied from Mozilla, with modifications. The original code
632 // was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for
633 // this function is:
634 // This function subject to the Mozilla Public License Version
635 // 1.1 (the "License"); you may not use this code except in compliance with
636 // the License. You may obtain a copy of the License at
637 // http://www.mozilla.org/MPL/
638 //
639 // Software distributed under the License is distributed on an "AS IS" basis,
640 // WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
641 // for the specific language governing rights and limitations under the
642 // License.
643 //
644 // The Original Code is mozilla.org code.
645 //
646 // The Initial Developer of the Original Code is
647 // Netscape Communications Corporation.
648 // Portions created by the Initial Developer are Copyright (C) 2000
649 // the Initial Developer. All Rights Reserved.
650 //
651 // Contributor(s):
652 // Scott Collins <scc@mozilla.org> (original author)
653 //
654 // This is a template so that it can be run on wide and 8-bit strings. We want
655 // to run it on wide strings when we have input that we think may have
656 // originally been UTF-8, but has been converted to wide characters because
657 // that's what we (and Windows) use internally.
658 template<typename CHAR>
659 static bool IsStringUTF8T(const CHAR* str, size_t length) {
660 bool overlong = false;
661 bool surrogate = false;
662 bool nonchar = false;
663
664 // overlong byte upper bound
665 typename ToUnsigned<CHAR>::Unsigned olupper = 0;
666
667 // surrogate byte lower bound
668 typename ToUnsigned<CHAR>::Unsigned slower = 0;
669
670 // incremented when inside a multi-byte char to indicate how many bytes
671 // are left in the sequence
672 int positions_left = 0;
673
674 for (uintptr_t i = 0; i < length; i++) {
675 // This whole function assume an unsigned value so force its conversion to
676 // an unsigned value.
677 typename ToUnsigned<CHAR>::Unsigned c = str[i];
678 if (c < 0x80)
679 continue; // ASCII
680
681 if (c <= 0xC1) {
682 // [80-BF] where not expected, [C0-C1] for overlong
683 return false;
684 } else if (IsBegin2ByteUTF8(c)) {
685 positions_left = 1;
686 } else if (IsBegin3ByteUTF8(c)) {
687 positions_left = 2;
688 if (c == 0xE0) {
689 // to exclude E0[80-9F][80-BF]
690 overlong = true;
691 olupper = 0x9F;
692 } else if (c == 0xED) {
693 // ED[A0-BF][80-BF]: surrogate codepoint
694 surrogate = true;
695 slower = 0xA0;
696 } else if (c == 0xEF) {
697 // EF BF [BE-BF] : non-character
698 // TODO(jungshik): EF B7 [90-AF] should be checked as well.
699 nonchar = true;
700 }
701 } else if (c <= 0xF4) {
702 positions_left = 3;
703 nonchar = true;
704 if (c == 0xF0) {
705 // to exclude F0[80-8F][80-BF]{2}
706 overlong = true;
707 olupper = 0x8F;
708 } else if (c == 0xF4) {
709 // to exclude F4[90-BF][80-BF]
710 // actually not surrogates but codepoints beyond 0x10FFFF
711 surrogate = true;
712 slower = 0x90;
713 }
714 } else {
715 return false;
716 }
717
718 // eat the rest of this multi-byte character
719 while (positions_left) {
720 positions_left--;
721 i++;
722 c = str[i];
723 if (!c)
724 return false; // end of string but not end of character sequence
725
726 // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
727 if (nonchar && ((!positions_left && c < 0xBE) ||
728 (positions_left == 1 && c != 0xBF) ||
729 (positions_left == 2 && 0x0F != (0x0F & c) ))) {
730 nonchar = false;
731 }
732 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||
733 (surrogate && slower <= c) || (nonchar && !positions_left) ) {
734 return false;
735 }
736 overlong = surrogate = false;
737 }
738 }
739 return true;
740 }
741
616 bool IsStringUTF8(const std::string& str) { 742 bool IsStringUTF8(const std::string& str) {
617 const char *src = str.data(); 743 return IsStringUTF8T(str.data(), str.length());
618 int32 src_len = static_cast<int32>(str.length()); 744 }
619 int32 char_index = 0;
620 745
621 while (char_index < src_len) { 746 bool IsStringWideUTF8(const std::wstring& str) {
622 int32 code_point; 747 return IsStringUTF8T(str.data(), str.length());
623 CBU8_NEXT(src, char_index, src_len, code_point);
624 if (!base::IsValidCodepoint(code_point))
625 return false;
626 }
627
628 return true;
629 } 748 }
630 749
631 template<typename Iter> 750 template<typename Iter>
632 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, 751 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
633 Iter a_end, 752 Iter a_end,
634 const char* b) { 753 const char* b) {
635 for (Iter it = a_begin; it != a_end; ++it, ++b) { 754 for (Iter it = a_begin; it != a_end; ++it, ++b) {
636 if (!*b || ToLowerASCII(*it) != *b) 755 if (!*b || ToLowerASCII(*it) != *b)
637 return false; 756 return false;
638 } 757 }
(...skipping 1119 matching lines...) Expand 10 before | Expand all | Expand 10 after
1758 // Each input byte creates two output hex characters. 1877 // Each input byte creates two output hex characters.
1759 std::string ret(size * 2, '\0'); 1878 std::string ret(size * 2, '\0');
1760 1879
1761 for (size_t i = 0; i < size; ++i) { 1880 for (size_t i = 0; i < size; ++i) {
1762 char b = reinterpret_cast<const char*>(bytes)[i]; 1881 char b = reinterpret_cast<const char*>(bytes)[i];
1763 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf]; 1882 ret[(i * 2)] = kHexChars[(b >> 4) & 0xf];
1764 ret[(i * 2) + 1] = kHexChars[b & 0xf]; 1883 ret[(i * 2) + 1] = kHexChars[b & 0xf];
1765 } 1884 }
1766 return ret; 1885 return ret;
1767 } 1886 }
OLDNEW
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698