Index: base/string_util.cc |
=================================================================== |
--- base/string_util.cc (revision 40171) |
+++ base/string_util.cc (working copy) |
@@ -24,6 +24,8 @@ |
#include "base/logging.h" |
#include "base/singleton.h" |
#include "base/third_party/dmg_fp/dmg_fp.h" |
+#include "base/utf_string_conversion_utils.h" |
+#include "base/third_party/icu/icu_utf.h" |
namespace { |
@@ -676,142 +678,20 @@ |
return DoIsStringASCII(str); |
} |
-// Helper functions that determine whether the given character begins a |
-// UTF-8 sequence of bytes with the given length. A character satisfies |
-// "IsInUTF8Sequence" if it is anything but the first byte in a multi-byte |
-// character. |
-static inline bool IsBegin2ByteUTF8(int c) { |
- return (c & 0xE0) == 0xC0; |
-} |
-static inline bool IsBegin3ByteUTF8(int c) { |
- return (c & 0xF0) == 0xE0; |
-} |
-static inline bool IsBegin4ByteUTF8(int c) { |
- return (c & 0xF8) == 0xF0; |
-} |
-static inline bool IsInUTF8Sequence(int c) { |
- return (c & 0xC0) == 0x80; |
-} |
+bool IsStringUTF8(const std::string& str) { |
+ const char *src = str.data(); |
+ int32 src_len = static_cast<int32>(str.length()); |
+ int32 char_index = 0; |
-// This function was copied from Mozilla, with modifications. The original code |
-// was 'IsUTF8' in xpcom/string/src/nsReadableUtils.cpp. The license block for |
-// this function is: |
-// This function subject to the Mozilla Public License Version |
-// 1.1 (the "License"); you may not use this code except in compliance with |
-// the License. You may obtain a copy of the License at |
-// http://www.mozilla.org/MPL/ |
-// |
-// Software distributed under the License is distributed on an "AS IS" basis, |
-// WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
-// for the specific language governing rights and limitations under the |
-// License. |
-// |
-// The Original Code is mozilla.org code. |
-// |
-// The Initial Developer of the Original Code is |
-// Netscape Communications Corporation. |
-// Portions created by the Initial Developer are Copyright (C) 2000 |
-// the Initial Developer. All Rights Reserved. |
-// |
-// Contributor(s): |
-// Scott Collins <scc@mozilla.org> (original author) |
-// |
-// This is a template so that it can be run on wide and 8-bit strings. We want |
-// to run it on wide strings when we have input that we think may have |
-// originally been UTF-8, but has been converted to wide characters because |
-// that's what we (and Windows) use internally. |
-template<typename CHAR> |
-static bool IsStringUTF8T(const CHAR* str, size_t length) { |
- bool overlong = false; |
- bool surrogate = false; |
- bool nonchar = false; |
- |
- // overlong byte upper bound |
- typename ToUnsigned<CHAR>::Unsigned olupper = 0; |
- |
- // surrogate byte lower bound |
- typename ToUnsigned<CHAR>::Unsigned slower = 0; |
- |
- // incremented when inside a multi-byte char to indicate how many bytes |
- // are left in the sequence |
- int positions_left = 0; |
- |
- for (uintptr_t i = 0; i < length; i++) { |
- // This whole function assume an unsigned value so force its conversion to |
- // an unsigned value. |
- typename ToUnsigned<CHAR>::Unsigned c = str[i]; |
- if (c < 0x80) |
- continue; // ASCII |
- |
- if (c <= 0xC1) { |
- // [80-BF] where not expected, [C0-C1] for overlong |
- return false; |
- } else if (IsBegin2ByteUTF8(c)) { |
- positions_left = 1; |
- } else if (IsBegin3ByteUTF8(c)) { |
- positions_left = 2; |
- if (c == 0xE0) { |
- // to exclude E0[80-9F][80-BF] |
- overlong = true; |
- olupper = 0x9F; |
- } else if (c == 0xED) { |
- // ED[A0-BF][80-BF]: surrogate codepoint |
- surrogate = true; |
- slower = 0xA0; |
- } else if (c == 0xEF) { |
- // EF BF [BE-BF] : non-character |
- // TODO(jungshik): EF B7 [90-AF] should be checked as well. |
- nonchar = true; |
- } |
- } else if (c <= 0xF4) { |
- positions_left = 3; |
- nonchar = true; |
- if (c == 0xF0) { |
- // to exclude F0[80-8F][80-BF]{2} |
- overlong = true; |
- olupper = 0x8F; |
- } else if (c == 0xF4) { |
- // to exclude F4[90-BF][80-BF] |
- // actually not surrogates but codepoints beyond 0x10FFFF |
- surrogate = true; |
- slower = 0x90; |
- } |
- } else { |
- return false; |
- } |
- |
- // eat the rest of this multi-byte character |
- while (positions_left) { |
- positions_left--; |
- i++; |
- c = str[i]; |
- if (!c) |
- return false; // end of string but not end of character sequence |
- |
- // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF] |
- if (nonchar && ((!positions_left && c < 0xBE) || |
- (positions_left == 1 && c != 0xBF) || |
- (positions_left == 2 && 0x0F != (0x0F & c) ))) { |
- nonchar = false; |
- } |
- if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || |
- (surrogate && slower <= c) || (nonchar && !positions_left) ) { |
- return false; |
- } |
- overlong = surrogate = false; |
- } |
+ while (char_index < src_len) { |
+ int32 code_point; |
+ CBU8_NEXT(src, char_index, src_len, code_point); |
+ if (!base::IsValidCharacter(code_point)) |
+ return false; |
} |
return true; |
} |
-bool IsStringUTF8(const std::string& str) { |
- return IsStringUTF8T(str.data(), str.length()); |
-} |
- |
-bool IsStringWideUTF8(const std::wstring& str) { |
- return IsStringUTF8T(str.data(), str.length()); |
-} |
- |
template<typename Iter> |
static inline bool DoLowerCaseEqualsASCII(Iter a_begin, |
Iter a_end, |