| OLD | NEW | 
|---|
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be | 
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. | 
| 4 | 4 | 
| 5 #include <math.h> | 5 #include <math.h> | 
| 6 #include <stdarg.h> | 6 #include <stdarg.h> | 
| 7 | 7 | 
| 8 #include <limits> | 8 #include <limits> | 
| 9 #include <sstream> | 9 #include <sstream> | 
| 10 | 10 | 
| 11 #include "base/basictypes.h" | 11 #include "base/basictypes.h" | 
| (...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 222   EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR | 222   EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR | 
| 223   EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5 | 223   EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5 | 
| 224   // "abc" with U+201[CD] in windows-125[0-8] | 224   // "abc" with U+201[CD] in windows-125[0-8] | 
| 225   EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94")); | 225   EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94")); | 
| 226   // U+0639 U+064E U+0644 U+064E in ISO-8859-6 | 226   // U+0639 U+064E U+0644 U+064E in ISO-8859-6 | 
| 227   EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee")); | 227   EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee")); | 
| 228   // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 | 228   // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 | 
| 229   EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); | 229   EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); | 
| 230 } | 230 } | 
| 231 | 231 | 
| 232 static const wchar_t* const kConvertRoundtripCases[] = { |  | 
| 233   L"Google Video", |  | 
| 234   // "网页 图片 资讯更多 »" |  | 
| 235   L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", |  | 
| 236   //  "Παγκόσμιος Ιστός" |  | 
| 237   L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" |  | 
| 238   L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", |  | 
| 239   // "Поиск страниц на русском" |  | 
| 240   L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" |  | 
| 241   L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" |  | 
| 242   L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", |  | 
| 243   // "전체서비스" |  | 
| 244   L"\xc804\xccb4\xc11c\xbe44\xc2a4", |  | 
| 245 |  | 
| 246   // Test characters that take more than 16 bits. This will depend on whether |  | 
| 247   // wchar_t is 16 or 32 bits. |  | 
| 248 #if defined(WCHAR_T_IS_UTF16) |  | 
| 249   L"\xd800\xdf00", |  | 
| 250   // Five non-BMP code points, U+11D40 - U+11D44, as surrogate pairs (originally labeled Mathematical Alphanumeric Symbols A,B,C,D,E, but that block starts at U+1D400). |  | 
| 251   L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", |  | 
| 252 #elif defined(WCHAR_T_IS_UTF32) |  | 
| 253   L"\x10300", |  | 
| 254   // Five non-BMP code points, U+11D40 - U+11D44 (originally labeled Mathematical Alphanumeric Symbols A,B,C,D,E, but that block starts at U+1D400). |  | 
| 255   L"\x11d40\x11d41\x11d42\x11d43\x11d44", |  | 
| 256 #endif |  | 
| 257 }; |  | 
| 258 |  | 
| 259 TEST(StringUtilTest, ConvertUTF8AndWide) { |  | 
| 260   // We round-trip all the wide strings through UTF-8 to make sure everything |  | 
| 261   // agrees on the conversion. This uses the stream operators to test them |  | 
| 262   // simultaneously. |  | 
| 263   for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { |  | 
| 264     std::ostringstream utf8; |  | 
| 265     utf8 << WideToUTF8(kConvertRoundtripCases[i]); |  | 
| 266     std::wostringstream wide; |  | 
| 267     wide << UTF8ToWide(utf8.str()); |  | 
| 268 |  | 
| 269     EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); |  | 
| 270   } |  | 
| 271 } |  | 
| 272 |  | 
| 273 TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { |  | 
| 274   // An empty std::wstring should be converted to an empty std::string, |  | 
| 275   // and vice versa. |  | 
| 276   std::wstring wempty; |  | 
| 277   std::string empty; |  | 
| 278   EXPECT_EQ(empty, WideToUTF8(wempty)); |  | 
| 279   EXPECT_EQ(wempty, UTF8ToWide(empty)); |  | 
| 280 } |  | 
| 281 |  | 
| 282 TEST(StringUtilTest, ConvertUTF8ToWide) { |  | 
| 283   struct UTF8ToWideCase { |  | 
| 284     const char* utf8; |  | 
| 285     const wchar_t* wide; |  | 
| 286     bool success; |  | 
| 287   } convert_cases[] = { |  | 
| 288     // Regular UTF-8 input. |  | 
| 289     {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, |  | 
| 290     // Non-character is passed through. |  | 
| 291     {"\xef\xbf\xbfHello", L"\xffffHello", true}, |  | 
| 292     // Truncated UTF-8 sequence. |  | 
| 293     {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, |  | 
| 294     // Truncated off the end. |  | 
| 295     {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, |  | 
| 296     // Non-shortest-form UTF-8. |  | 
| 297     {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, |  | 
| 298     // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. |  | 
| 299     {"\xed\xb0\x80", L"", false}, |  | 
| 300     // Non-BMP characters. The second is a non-character regarded as valid. |  | 
| 301     // The result will either be in UTF-16 or UTF-32. |  | 
| 302 #if defined(WCHAR_T_IS_UTF16) |  | 
| 303     {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, |  | 
| 304     {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, |  | 
| 305 #elif defined(WCHAR_T_IS_UTF32) |  | 
| 306     {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, |  | 
| 307     {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, |  | 
| 308 #endif |  | 
| 309   }; |  | 
| 310 |  | 
| 311   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { |  | 
| 312     std::wstring converted; |  | 
| 313     EXPECT_EQ(convert_cases[i].success, |  | 
| 314               UTF8ToWide(convert_cases[i].utf8, |  | 
| 315                          strlen(convert_cases[i].utf8), |  | 
| 316                          &converted)); |  | 
| 317     std::wstring expected(convert_cases[i].wide); |  | 
| 318     EXPECT_EQ(expected, converted); |  | 
| 319   } |  | 
| 320 |  | 
| 321   // Manually test an embedded NUL character. |  | 
| 322   std::wstring converted; |  | 
| 323   EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); |  | 
| 324   ASSERT_EQ(3U, converted.length()); |  | 
| 325 #if defined(WCHAR_T_IS_UNSIGNED) |  | 
| 326   EXPECT_EQ(0U, converted[0]); |  | 
| 327 #else |  | 
| 328   EXPECT_EQ(0, converted[0]); |  | 
| 329 #endif |  | 
| 330   EXPECT_EQ('Z', converted[1]); |  | 
| 331   EXPECT_EQ('\t', converted[2]); |  | 
| 332 |  | 
| 333   // Make sure that conversion replaces, not appends. |  | 
| 334   EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); |  | 
| 335   ASSERT_EQ(1U, converted.length()); |  | 
| 336   EXPECT_EQ('B', converted[0]); |  | 
| 337 } |  | 
| 338 |  | 
| 339 #if defined(WCHAR_T_IS_UTF16) |  | 
| 340 // This test is only valid when wchar_t == UTF-16. |  | 
| 341 TEST(StringUtilTest, ConvertUTF16ToUTF8) { |  | 
| 342   struct UTF16ToUTF8Case { |  | 
| 343     const wchar_t* utf16; |  | 
| 344     const char* utf8; |  | 
| 345     bool success; |  | 
| 346   } convert_cases[] = { |  | 
| 347     // Regular UTF-16 input. |  | 
| 348     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, |  | 
| 349     // Test a non-BMP character. |  | 
| 350     {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, |  | 
| 351     // Non-characters are passed through. |  | 
| 352     {L"\xffffHello", "\xEF\xBF\xBFHello", true}, |  | 
| 353     {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, |  | 
| 354     // The first character is a truncated UTF-16 character. |  | 
| 355     {L"\xd800\x597d", "\xe5\xa5\xbd", false}, |  | 
| 356     // Truncated at the end. |  | 
| 357     {L"\x597d\xd800", "\xe5\xa5\xbd", false}, |  | 
| 358   }; |  | 
| 359 |  | 
| 360   for (int i = 0; i < arraysize(convert_cases); i++) { |  | 
| 361     std::string converted; |  | 
| 362     EXPECT_EQ(convert_cases[i].success, |  | 
| 363               WideToUTF8(convert_cases[i].utf16, |  | 
| 364                          wcslen(convert_cases[i].utf16), |  | 
| 365                          &converted)); |  | 
| 366     std::string expected(convert_cases[i].utf8); |  | 
| 367     EXPECT_EQ(expected, converted); |  | 
| 368   } |  | 
| 369 } |  | 
| 370 |  | 
| 371 #elif defined(WCHAR_T_IS_UTF32) |  | 
| 372 // This test is only valid when wchar_t == UTF-32. |  | 
| 373 TEST(StringUtilTest, ConvertUTF32ToUTF8) { |  | 
| 374   struct WideToUTF8Case { |  | 
| 375     const wchar_t* utf32; |  | 
| 376     const char* utf8; |  | 
| 377     bool success; |  | 
| 378   } convert_cases[] = { |  | 
| 379     // Regular 16-bit input. |  | 
| 380     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, |  | 
| 381     // Test a non-BMP character. |  | 
| 382     {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, |  | 
| 383     // Non-characters are passed through. |  | 
| 384     {L"\xffffHello", "\xEF\xBF\xBFHello", true}, |  | 
| 385     {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, |  | 
| 386     // Invalid Unicode code points. |  | 
| 387     {L"\xfffffffHello", "Hello", false}, |  | 
| 388     // Lone UTF-16 surrogate code points are invalid in UTF-32 and are dropped. |  | 
| 389     {L"\xd800\x597d", "\xe5\xa5\xbd", false}, |  | 
| 390     {L"\xdc01Hello", "Hello", false}, |  | 
| 391   }; |  | 
| 392 |  | 
| 393   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { |  | 
| 394     std::string converted; |  | 
| 395     EXPECT_EQ(convert_cases[i].success, |  | 
| 396               WideToUTF8(convert_cases[i].utf32, |  | 
| 397                          wcslen(convert_cases[i].utf32), |  | 
| 398                          &converted)); |  | 
| 399     std::string expected(convert_cases[i].utf8); |  | 
| 400     EXPECT_EQ(expected, converted); |  | 
| 401   } |  | 
| 402 } |  | 
| 403 #endif  // defined(WCHAR_T_IS_UTF32) |  | 
| 404 |  | 
| 405 TEST(StringUtilTest, ConvertMultiString) { |  | 
| 406   static wchar_t wmulti[] = { |  | 
| 407     L'f', L'o', L'o', L'\0', |  | 
| 408     L'b', L'a', L'r', L'\0', |  | 
| 409     L'b', L'a', L'z', L'\0', |  | 
| 410     L'\0' |  | 
| 411   }; |  | 
| 412   static char multi[] = { |  | 
| 413     'f', 'o', 'o', '\0', |  | 
| 414     'b', 'a', 'r', '\0', |  | 
| 415     'b', 'a', 'z', '\0', |  | 
| 416     '\0' |  | 
| 417   }; |  | 
| 418   std::wstring wmultistring; |  | 
| 419   memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); |  | 
| 420   EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); |  | 
| 421   std::string expected; |  | 
| 422   memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); |  | 
| 423   EXPECT_EQ(arraysize(multi) - 1, expected.length()); |  | 
| 424   const std::string& converted = WideToUTF8(wmultistring); |  | 
| 425   EXPECT_EQ(arraysize(multi) - 1, converted.length()); |  | 
| 426   EXPECT_EQ(expected, converted); |  | 
| 427 } |  | 
| 428 |  | 
| 429 TEST(StringUtilTest, ConvertASCII) { | 232 TEST(StringUtilTest, ConvertASCII) { | 
| 430   static const char* char_cases[] = { | 233   static const char* char_cases[] = { | 
| 431     "Google Video", | 234     "Google Video", | 
| 432     "Hello, world\n", | 235     "Hello, world\n", | 
| 433     "0123ABCDwxyz \a\b\t\r\n!+,.~" | 236     "0123ABCDwxyz \a\b\t\r\n!+,.~" | 
| 434   }; | 237   }; | 
| 435 | 238 | 
| 436   static const wchar_t* const wchar_cases[] = { | 239   static const wchar_t* const wchar_cases[] = { | 
| 437     L"Google Video", | 240     L"Google Video", | 
| 438     L"Hello, world\n", | 241     L"Hello, world\n", | 
| (...skipping 1100 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1539 | 1342 | 
| 1540 TEST(StringUtilTest, HexEncode) { | 1343 TEST(StringUtilTest, HexEncode) { | 
| 1541   std::string hex(HexEncode(NULL, 0)); | 1344   std::string hex(HexEncode(NULL, 0)); | 
| 1542   EXPECT_EQ(hex.length(), 0U); | 1345   EXPECT_EQ(hex.length(), 0U); | 
| 1543   unsigned char bytes[] = {0x01, 0xff, 0x02, 0xfe, 0x03, 0x80, 0x81}; | 1346   unsigned char bytes[] = {0x01, 0xff, 0x02, 0xfe, 0x03, 0x80, 0x81}; | 
| 1544   hex = HexEncode(bytes, sizeof(bytes)); | 1347   hex = HexEncode(bytes, sizeof(bytes)); | 
| 1545   EXPECT_EQ(hex.compare("01FF02FE038081"), 0); | 1348   EXPECT_EQ(hex.compare("01FF02FE038081"), 0); | 
| 1546 } | 1349 } | 
| 1547 | 1350 | 
| 1548 }  // namespace base | 1351 }  // namespace base | 
| OLD | NEW | 
|---|