Index: base/string_util_unittest.cc |
=================================================================== |
--- base/string_util_unittest.cc (revision 19007) |
+++ base/string_util_unittest.cc (working copy) |
@@ -309,8 +309,8 @@ |
} convert_cases[] = { |
// Regular UTF-8 input. |
{"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, |
- // Invalid Unicode code point. |
- {"\xef\xbf\xbfHello", L"Hello", false}, |
+ // The non-character U+FFFF is passed through. |
+ {"\xef\xbf\xbfHello", L"\xffffHello", true}, |
// Truncated UTF-8 sequence. |
{"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, |
// Truncated off the end. |
@@ -319,11 +319,14 @@ |
{"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, |
// This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. |
{"\xed\xb0\x80", L"", false}, |
- // Non-BMP character. The result will either be in UTF-16 or UTF-32. |
+ // Non-BMP characters. The second (U+10FFFE) is a non-character regarded as |
+ // valid. The result will be in either UTF-16 or UTF-32. |
#if defined(WCHAR_T_IS_UTF16) |
{"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, |
+ {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, |
#elif defined(WCHAR_T_IS_UTF32) |
{"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, |
+ {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, |
#endif |
}; |
@@ -367,8 +370,9 @@ |
{L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, |
// Test a non-BMP character. |
{L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, |
- // Invalid Unicode code point. |
- {L"\xffffHello", "Hello", false}, |
+ // Non-characters (U+FFFF, U+10FFFE) are passed through. |
+ {L"\xffffHello", "\xEF\xBF\xBFHello", true}, |
+ {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, |
// The first character is a truncated UTF-16 character. |
{L"\xd800\x597d", "\xe5\xa5\xbd", false}, |
// Truncated at the end. |
@@ -389,7 +393,7 @@ |
#elif defined(WCHAR_T_IS_UTF32) |
// This test is only valid when wchar_t == UTF-32. |
TEST(StringUtilTest, ConvertUTF32ToUTF8) { |
- struct UTF8ToWideCase { |
+ struct WideToUTF8Case { |
const wchar_t* utf32; |
const char* utf8; |
bool success; |
@@ -398,11 +402,14 @@ |
{L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, |
// Test a non-BMP character. |
{L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, |
+ // Non-characters (U+FFFF, U+10FFFE) are passed through. |
+ {L"\xffffHello", "\xEF\xBF\xBFHello", true}, |
+ {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, |
// Invalid Unicode code points. |
- {L"\xffffHello", "Hello", false}, |
{L"\xfffffffHello", "Hello", false}, |
// The first character is a truncated UTF-16 character. |
{L"\xd800\x597d", "\xe5\xa5\xbd", false}, |
+ {L"\xdc01Hello", "Hello", false}, |
}; |
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { |