| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include <math.h> | 5 #include <math.h> |
| 6 #include <stdarg.h> | 6 #include <stdarg.h> |
| 7 | 7 |
| 8 #include <limits> | 8 #include <limits> |
| 9 #include <sstream> | 9 #include <sstream> |
| 10 | 10 |
| (...skipping 24 matching lines...) Expand all Loading... |
| 35 #elif defined(WCHAR_T_IS_UTF32) | 35 #elif defined(WCHAR_T_IS_UTF32) |
| 36 string16 u16; | 36 string16 u16; |
| 37 while (*s != 0) { | 37 while (*s != 0) { |
| 38 DCHECK_LE(static_cast<unsigned int>(*s), 0xFFFFu); | 38 DCHECK_LE(static_cast<unsigned int>(*s), 0xFFFFu); |
| 39 u16.push_back(*s++); | 39 u16.push_back(*s++); |
| 40 } | 40 } |
| 41 return u16; | 41 return u16; |
| 42 #endif | 42 #endif |
| 43 } | 43 } |
| 44 | 44 |
| 45 const wchar_t* const kConvertRoundtripCases[] = { | |
| 46 L"Google Video", | |
| 47 // "网页 图片 资讯更多 »" | |
| 48 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", | |
| 49 // "Παγκόσμιος Ιστός" | |
| 50 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" | |
| 51 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", | |
| 52 // "Поиск страниц на русском" | |
| 53 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" | |
| 54 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" | |
| 55 L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", | |
| 56 // "전체서비스" | |
| 57 L"\xc804\xccb4\xc11c\xbe44\xc2a4", | |
| 58 | |
| 59 // Test characters that take more than 16 bits. This will depend on whether | |
| 60 // wchar_t is 16 or 32 bits. | |
| 61 #if defined(WCHAR_T_IS_UTF16) | |
| 62 L"\xd800\xdf00", | |
| 63 // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) | |
| 64 L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", | |
| 65 #elif defined(WCHAR_T_IS_UTF32) | |
| 66 L"\x10300", | |
| 67 // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) | |
| 68 L"\x11d40\x11d41\x11d42\x11d43\x11d44", | |
| 69 #endif | |
| 70 }; | |
| 71 | |
| 72 } // namespace | 45 } // namespace |
| 73 | 46 |
| 74 TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { | |
| 75 // Make sure WideToCodepage works like WideToUTF8. | |
| 76 for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { | |
| 77 SCOPED_TRACE(base::StringPrintf("Test[%" PRIuS "]: %ls", | |
| 78 i, kConvertRoundtripCases[i])); | |
| 79 | |
| 80 std::string expected(WideToUTF8(kConvertRoundtripCases[i])); | |
| 81 std::string utf8; | |
| 82 EXPECT_TRUE(WideToCodepage(kConvertRoundtripCases[i], kCodepageUTF8, | |
| 83 OnStringConversionError::SKIP, &utf8)); | |
| 84 EXPECT_EQ(expected, utf8); | |
| 85 } | |
| 86 } | |
| 87 | |
| 88 // kConvertCodepageCases is not comprehensive. There are a number of cases | 47 // kConvertCodepageCases is not comprehensive. There are a number of cases |
| 89 // to add if we really want to have comprehensive coverage of various | 48 // to add if we really want to have comprehensive coverage of various |
| 90 // codepages and their 'idiosyncrasies'. Currently, the only implementation | 49 // codepages and their 'idiosyncrasies'. Currently, the only implementation |
| 91 // for CodepageTo* and *ToCodepage uses ICU, which has a very extensive | 50 // for CodepageTo* and *ToCodepage uses ICU, which has a very extensive |
| 92 // set of tests for the charset conversion. So, we can get away with a | 51 // set of tests for the charset conversion. So, we can get away with a |
| 93 // relatively small number of cases listed below. | 52 // relatively small number of cases listed below. |
| 94 // | 53 // |
| 95 // Note about |u16_wide| in the following struct. | 54 // Note about |u16_wide| in the following struct. |
| 96 // On Windows, the field is always identical to |wide|. On Mac and Linux, | 55 // On Windows, the field is always identical to |wide|. On Mac and Linux, |
| 97 // it's identical as long as there's no character outside the | 56 // it's identical as long as there's no character outside the |
| (...skipping 128 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 226 // Thai (windows-874) | 185 // Thai (windows-874) |
| 227 {"windows-874", | 186 {"windows-874", |
| 228 "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", | 187 "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", |
| 229 OnStringConversionError::FAIL, | 188 OnStringConversionError::FAIL, |
| 230 true, | 189 true, |
| 231 L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" | 190 L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" |
| 232 L"\x0E04\x0E23\x0e31\x0E1A", | 191 L"\x0E04\x0E23\x0e31\x0E1A", |
| 233 NULL}, | 192 NULL}, |
| 234 }; | 193 }; |
| 235 | 194 |
| 236 TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { | |
| 237 for (size_t i = 0; i < arraysize(kConvertCodepageCases); ++i) { | |
| 238 SCOPED_TRACE(base::StringPrintf( | |
| 239 "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, | |
| 240 kConvertCodepageCases[i].encoded, | |
| 241 kConvertCodepageCases[i].codepage_name)); | |
| 242 | |
| 243 std::wstring wide; | |
| 244 bool success = CodepageToWide(kConvertCodepageCases[i].encoded, | |
| 245 kConvertCodepageCases[i].codepage_name, | |
| 246 kConvertCodepageCases[i].on_error, | |
| 247 &wide); | |
| 248 EXPECT_EQ(kConvertCodepageCases[i].success, success); | |
| 249 EXPECT_EQ(kConvertCodepageCases[i].wide, wide); | |
| 250 | |
| 251 // When decoding was successful and nothing was skipped, we also check the | |
| 252 // reverse conversion. Not all conversions are round-trippable, but | |
| 253 // kConvertCodepageCases does not have any one-way conversion at the | |
| 254 // moment. | |
| 255 if (success && | |
| 256 kConvertCodepageCases[i].on_error == | |
| 257 OnStringConversionError::FAIL) { | |
| 258 std::string encoded; | |
| 259 success = WideToCodepage(wide, kConvertCodepageCases[i].codepage_name, | |
| 260 kConvertCodepageCases[i].on_error, &encoded); | |
| 261 EXPECT_EQ(kConvertCodepageCases[i].success, success); | |
| 262 EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); | |
| 263 } | |
| 264 } | |
| 265 | |
| 266 // The above cases handled codepage->wide errors, but not wide->codepage. | |
| 267 // Test that here. | |
| 268 std::string encoded("Temp data"); // Make sure the string gets cleared. | |
| 269 | |
| 270 // First test going to an encoding that can not represent that character. | |
| 271 EXPECT_FALSE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", | |
| 272 OnStringConversionError::FAIL, &encoded)); | |
| 273 EXPECT_TRUE(encoded.empty()); | |
| 274 EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", | |
| 275 OnStringConversionError::SKIP, &encoded)); | |
| 276 EXPECT_STREQ("Chinese", encoded.c_str()); | |
| 277 // From Unicode, SUBSTITUTE is the same as SKIP for now. | |
| 278 EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", | |
| 279 OnStringConversionError::SUBSTITUTE, | |
| 280 &encoded)); | |
| 281 EXPECT_STREQ("Chinese", encoded.c_str()); | |
| 282 | |
| 283 #if defined(WCHAR_T_IS_UTF16) | |
| 284 // When we're in UTF-16 mode, test an invalid UTF-16 character in the input. | |
| 285 EXPECT_FALSE(WideToCodepage(L"a\xd800z", "iso-8859-1", | |
| 286 OnStringConversionError::FAIL, &encoded)); | |
| 287 EXPECT_TRUE(encoded.empty()); | |
| 288 EXPECT_TRUE(WideToCodepage(L"a\xd800z", "iso-8859-1", | |
| 289 OnStringConversionError::SKIP, &encoded)); | |
| 290 EXPECT_STREQ("az", encoded.c_str()); | |
| 291 #endif // WCHAR_T_IS_UTF16 | |
| 292 | |
| 293 // Invalid characters are dropped under SKIP, so the conversion succeeds. | |
| 294 EXPECT_TRUE(WideToCodepage(L"a\xffffz", "iso-8859-1", | |
| 295 OnStringConversionError::SKIP, &encoded)); | |
| 296 EXPECT_STREQ("az", encoded.c_str()); | |
| 297 | |
| 298 // Invalid codepages should fail. | |
| 299 EXPECT_FALSE(WideToCodepage(L"Hello, world", "awesome-8571-2", | |
| 300 OnStringConversionError::SKIP, &encoded)); | |
| 301 } | |
| 302 | |
| 303 TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { | 195 TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { |
| 304 for (size_t i = 0; i < arraysize(kConvertCodepageCases); ++i) { | 196 for (size_t i = 0; i < arraysize(kConvertCodepageCases); ++i) { |
| 305 SCOPED_TRACE(base::StringPrintf( | 197 SCOPED_TRACE(base::StringPrintf( |
| 306 "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, | 198 "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, |
| 307 kConvertCodepageCases[i].encoded, | 199 kConvertCodepageCases[i].encoded, |
| 308 kConvertCodepageCases[i].codepage_name)); | 200 kConvertCodepageCases[i].codepage_name)); |
| 309 | 201 |
| 310 string16 utf16; | 202 string16 utf16; |
| 311 bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, | 203 bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, |
| 312 kConvertCodepageCases[i].codepage_name, | 204 kConvertCodepageCases[i].codepage_name, |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 364 | 256 |
| 365 bool success = ConvertToUtf8AndNormalize( | 257 bool success = ConvertToUtf8AndNormalize( |
| 366 kConvertAndNormalizeCases[i].encoded, | 258 kConvertAndNormalizeCases[i].encoded, |
| 367 kConvertAndNormalizeCases[i].codepage_name, &result); | 259 kConvertAndNormalizeCases[i].codepage_name, &result); |
| 368 EXPECT_EQ(kConvertAndNormalizeCases[i].expected_success, success); | 260 EXPECT_EQ(kConvertAndNormalizeCases[i].expected_success, success); |
| 369 EXPECT_EQ(kConvertAndNormalizeCases[i].expected_value, result); | 261 EXPECT_EQ(kConvertAndNormalizeCases[i].expected_value, result); |
| 370 } | 262 } |
| 371 } | 263 } |
| 372 | 264 |
| 373 } // namespace base | 265 } // namespace base |
| OLD | NEW |