OLD | NEW |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include <math.h> | 5 #include <math.h> |
6 #include <stdarg.h> | 6 #include <stdarg.h> |
7 | 7 |
8 #include <limits> | 8 #include <limits> |
9 #include <sstream> | 9 #include <sstream> |
10 | 10 |
11 #include "base/basictypes.h" | 11 #include "base/basictypes.h" |
| 12 #include "base/i18n/icu_string_conversions.h" |
12 #include "base/logging.h" | 13 #include "base/logging.h" |
13 #include "base/utf_string_conversions.h" | 14 #include "base/utf_string_conversions.h" |
14 #include "base/i18n/icu_string_conversions.h" | |
15 #include "testing/gtest/include/gtest/gtest.h" | 15 #include "testing/gtest/include/gtest/gtest.h" |
16 | 16 |
17 namespace base { | 17 namespace base { |
18 | 18 |
19 namespace { | 19 namespace { |
20 | 20 |
21 // Given a null-terminated string of wchar_t with each wchar_t representing | 21 // Given a null-terminated string of wchar_t with each wchar_t representing |
22 // a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. | 22 // a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. |
23 // Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) | 23 // Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) |
24 // should be represented as a surrogate pair (two UTF-16 units) | 24 // should be represented as a surrogate pair (two UTF-16 units) |
25 // *even* where wchar_t is 32-bit (Linux and Mac). | 25 // *even* where wchar_t is 32-bit (Linux and Mac). |
26 // | 26 // |
27 // This is to help write tests for functions with string16 params until | 27 // This is to help write tests for functions with string16 params until |
28 // the C++ 0x UTF-16 literal is well-supported by compilers. | 28 // the C++ 0x UTF-16 literal is well-supported by compilers. |
29 string16 BuildString16(const wchar_t* s) { | 29 string16 BuildString16(const wchar_t* s) { |
30 #if defined(WCHAR_T_IS_UTF16) | 30 #if defined(WCHAR_T_IS_UTF16) |
31 return string16(s); | 31 return string16(s); |
32 #elif defined(WCHAR_T_IS_UTF32) | 32 #elif defined(WCHAR_T_IS_UTF32) |
33 string16 u16; | 33 string16 u16; |
34 while (*s != 0) { | 34 while (*s != 0) { |
35 DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu); | 35 DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu); |
36 u16.push_back(*s++); | 36 u16.push_back(*s++); |
37 } | 37 } |
38 return u16; | 38 return u16; |
39 #endif | 39 #endif |
40 } | 40 } |
41 | 41 |
42 static const wchar_t* const kConvertRoundtripCases[] = { | 42 const wchar_t* const kConvertRoundtripCases[] = { |
43 L"Google Video", | 43 L"Google Video", |
44 // "网页 图片 资讯更多 »" | 44 // "网页 图片 资讯更多 »" |
45 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", | 45 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", |
46 // "Παγκόσμιος Ιστός" | 46 // "Παγκόσμιος Ιστός" |
47 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" | 47 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" |
48 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", | 48 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", |
49 // "Поиск страниц на русском" | 49 // "Поиск страниц на русском" |
50 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" | 50 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" |
51 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" | 51 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" |
52 L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", | 52 L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", |
53 // "전체서비스" | 53 // "전체서비스" |
54 L"\xc804\xccb4\xc11c\xbe44\xc2a4", | 54 L"\xc804\xccb4\xc11c\xbe44\xc2a4", |
55 | 55 |
56 // Test characters that take more than 16 bits. This will depend on whether | 56 // Test characters that take more than 16 bits. This will depend on whether |
57 // wchar_t is 16 or 32 bits. | 57 // wchar_t is 16 or 32 bits. |
58 #if defined(WCHAR_T_IS_UTF16) | 58 #if defined(WCHAR_T_IS_UTF16) |
59 L"\xd800\xdf00", | 59 L"\xd800\xdf00", |
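60 // Five non-BMP characters U+11D40 - U+11D44, each written as a surrogate pair. Labelled "Mathematical Alphanumeric Symbols A, B, C, D, E"; NOTE(review): that block starts at U+1D400 - confirm the intended range. | 60 // Five non-BMP characters U+11D40 - U+11D44, each written as a surrogate pair. Labelled "Mathematical Alphanumeric Symbols A, B, C, D, E"; NOTE(review): that block starts at U+1D400 - confirm the intended range. |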
61 L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", | 61 L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", |
62 #elif defined(WCHAR_T_IS_UTF32) | 62 #elif defined(WCHAR_T_IS_UTF32) |
63 L"\x10300", | 63 L"\x10300", |
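64 // Five non-BMP characters U+11D40 - U+11D44, each written as a single 32-bit code unit. Labelled "Mathematical Alphanumeric Symbols A, B, C, D, E"; NOTE(review): that block starts at U+1D400 - confirm the intended range. | 64 // Five non-BMP characters U+11D40 - U+11D44, each written as a single 32-bit code unit. Labelled "Mathematical Alphanumeric Symbols A, B, C, D, E"; NOTE(review): that block starts at U+1D400 - confirm the intended range. |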
65 L"\x11d40\x11d41\x11d42\x11d43\x11d44", | 65 L"\x11d40\x11d41\x11d42\x11d43\x11d44", |
66 #endif | 66 #endif |
67 }; | 67 }; |
68 | 68 |
69 } // namespace | 69 } // namespace |
70 | 70 |
71 TEST(StringUtilTest, ConvertCodepageUTF8) { | 71 TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { |
72 // Make sure WideToCodepage works like WideToUTF8. | 72 // Make sure WideToCodepage works like WideToUTF8. |
73 for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { | 73 for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { |
74 std::string expected(WideToUTF8(kConvertRoundtripCases[i])); | 74 std::string expected(WideToUTF8(kConvertRoundtripCases[i])); |
75 std::string utf8; | 75 std::string utf8; |
76 EXPECT_TRUE(WideToCodepage(kConvertRoundtripCases[i], kCodepageUTF8, | 76 EXPECT_TRUE(WideToCodepage(kConvertRoundtripCases[i], kCodepageUTF8, |
77 OnStringConversionError::SKIP, &utf8)); | 77 OnStringConversionError::SKIP, &utf8)); |
78 EXPECT_EQ(expected, utf8); | 78 EXPECT_EQ(expected, utf8); |
79 } | 79 } |
80 } | 80 } |
81 | 81 |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
149 true, | 149 true, |
150 L"\x00A5\x00A8", | 150 L"\x00A5\x00A8", |
151 NULL}, | 151 NULL}, |
152 // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000) | 152 // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000) |
153 {"gb18030", | 153 {"gb18030", |
154 "\x95\x32\x82\x36\xD2\xBB", | 154 "\x95\x32\x82\x36\xD2\xBB", |
155 OnStringConversionError::FAIL, | 155 OnStringConversionError::FAIL, |
156 true, | 156 true, |
157 #if defined(WCHAR_T_IS_UTF16) | 157 #if defined(WCHAR_T_IS_UTF16) |
158 L"\xD840\xDC00\x4E00", | 158 L"\xD840\xDC00\x4E00", |
159 #else | 159 #elif defined(WCHAR_T_IS_UTF32) |
160 L"\x20000\x4E00", | 160 L"\x20000\x4E00", |
161 #endif | 161 #endif |
162 L"\xD840\xDC00\x4E00"}, | 162 L"\xD840\xDC00\x4E00"}, |
163 {"big5", | 163 {"big5", |
164 "\xA7\x41\xA6\x6E", | 164 "\xA7\x41\xA6\x6E", |
165 OnStringConversionError::FAIL, | 165 OnStringConversionError::FAIL, |
166 true, | 166 true, |
167 L"\x4F60\x597D", | 167 L"\x4F60\x597D", |
168 NULL}, | 168 NULL}, |
169 // Greek (ISO-8859) | 169 // Greek (ISO-8859) |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
227 // Thai (windows-874) | 227 // Thai (windows-874) |
228 {"windows-874", | 228 {"windows-874", |
229 "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", | 229 "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", |
230 OnStringConversionError::FAIL, | 230 OnStringConversionError::FAIL, |
231 true, | 231 true, |
232 L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" | 232 L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" |
233 L"\x0E04\x0E23\x0e31\x0E1A", | 233 L"\x0E04\x0E23\x0e31\x0E1A", |
234 NULL}, | 234 NULL}, |
235 }; | 235 }; |
236 | 236 |
237 TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { | 237 TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { |
238 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { | 238 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { |
239 std::wstring wide; | 239 std::wstring wide; |
240 bool success = CodepageToWide(kConvertCodepageCases[i].encoded, | 240 bool success = CodepageToWide(kConvertCodepageCases[i].encoded, |
241 kConvertCodepageCases[i].codepage_name, | 241 kConvertCodepageCases[i].codepage_name, |
242 kConvertCodepageCases[i].on_error, | 242 kConvertCodepageCases[i].on_error, |
243 &wide); | 243 &wide); |
244 EXPECT_EQ(kConvertCodepageCases[i].success, success); | 244 EXPECT_EQ(kConvertCodepageCases[i].success, success); |
245 EXPECT_EQ(kConvertCodepageCases[i].wide, wide); | 245 EXPECT_EQ(kConvertCodepageCases[i].wide, wide); |
246 | 246 |
247 // When decoding was successful and nothing was skipped, we also check the | 247 // When decoding was successful and nothing was skipped, we also check the |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
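289 // With SKIP, invalid characters should be dropped and the conversion should still succeed. | 289 // With SKIP, invalid characters should be dropped and the conversion should still succeed. |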
290 EXPECT_TRUE(WideToCodepage(L"a\xffffz", "iso-8859-1", | 290 EXPECT_TRUE(WideToCodepage(L"a\xffffz", "iso-8859-1", |
291 OnStringConversionError::SKIP, &encoded)); | 291 OnStringConversionError::SKIP, &encoded)); |
292 EXPECT_STREQ("az", encoded.c_str()); | 292 EXPECT_STREQ("az", encoded.c_str()); |
293 | 293 |
294 // Invalid codepages should fail. | 294 // Invalid codepages should fail. |
295 EXPECT_FALSE(WideToCodepage(L"Hello, world", "awesome-8571-2", | 295 EXPECT_FALSE(WideToCodepage(L"Hello, world", "awesome-8571-2", |
296 OnStringConversionError::SKIP, &encoded)); | 296 OnStringConversionError::SKIP, &encoded)); |
297 } | 297 } |
298 | 298 |
299 TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { | 299 TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { |
300 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { | 300 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { |
301 string16 utf16; | 301 string16 utf16; |
302 bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, | 302 bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, |
303 kConvertCodepageCases[i].codepage_name, | 303 kConvertCodepageCases[i].codepage_name, |
304 kConvertCodepageCases[i].on_error, | 304 kConvertCodepageCases[i].on_error, |
305 &utf16); | 305 &utf16); |
306 string16 utf16_expected; | 306 string16 utf16_expected; |
307 if (kConvertCodepageCases[i].u16_wide == NULL) | 307 if (kConvertCodepageCases[i].u16_wide == NULL) |
308 utf16_expected = BuildString16(kConvertCodepageCases[i].wide); | 308 utf16_expected = BuildString16(kConvertCodepageCases[i].wide); |
309 else | 309 else |
310 utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide); | 310 utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide); |
311 EXPECT_EQ(kConvertCodepageCases[i].success, success); | 311 EXPECT_EQ(kConvertCodepageCases[i].success, success); |
312 EXPECT_EQ(utf16_expected, utf16); | 312 EXPECT_EQ(utf16_expected, utf16); |
313 | 313 |
314 // When decoding was successful and nothing was skipped, we also check the | 314 // When decoding was successful and nothing was skipped, we also check the |
315 // reverse conversion. See also the corresponding comment in | 315 // reverse conversion. See also the corresponding comment in |
316 // ConvertBetweenCodepageAndWide. | 316 // ConvertBetweenCodepageAndWide. |
317 if (success && | 317 if (success && |
318 kConvertCodepageCases[i].on_error == OnStringConversionError::FAIL) { | 318 kConvertCodepageCases[i].on_error == OnStringConversionError::FAIL) { |
319 std::string encoded; | 319 std::string encoded; |
320 success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name, | 320 success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name, |
321 kConvertCodepageCases[i].on_error, &encoded); | 321 kConvertCodepageCases[i].on_error, &encoded); |
322 EXPECT_EQ(kConvertCodepageCases[i].success, success); | 322 EXPECT_EQ(kConvertCodepageCases[i].success, success); |
323 EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); | 323 EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); |
324 } | 324 } |
325 } | 325 } |
326 } | 326 } |
327 | 327 |
| 328 static const struct { |
| 329 const char* codepage_name; |
| 330 const char* encoded; |
| 331 size_t input_offset; |
| 332 size_t u16_output_offset; |
| 333 size_t wide_output_offset; |
| 334 } kAdjustOffsetCases[] = { |
| 335 {"gb2312", "", 0, string16::npos, std::wstring::npos}, |
| 336 {"gb2312", "\xC4\xE3\xBA\xC3", 0, 0, 0}, |
| 337 {"gb2312", "\xC4\xE3\xBA\xC3", 2, 1, 1}, |
| 338 {"gb2312", "\xC4\xE3\xBA\xC3", 4, string16::npos, std::wstring::npos}, |
| 339 {"gb2312", "\xC4\xE3\xBA\xC3", 1, string16::npos, std::wstring::npos}, |
| 340 {"gb2312", "\xC4\xE3\xBA\xC3", std::string::npos, string16::npos, |
| 341 std::wstring::npos}, |
| 342 {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 2, string16::npos, |
| 343 std::wstring::npos}, |
| 344 {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 4, 2, 1}, |
| 345 }; |
| 346 |
| 347 TEST(ICUStringConversionsTest, AdjustOffset) { |
| 348 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kAdjustOffsetCases); ++i) { |
| 349 string16 utf16; |
| 350 size_t offset = kAdjustOffsetCases[i].input_offset; |
| 351 EXPECT_TRUE(CodepageToUTF16AndAdjustOffset(kAdjustOffsetCases[i].encoded, |
| 352 kAdjustOffsetCases[i].codepage_name, |
| 353 OnStringConversionError::FAIL, &utf16, &offset)); |
| 354 EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); |
| 355 |
| 356 std::wstring wide; |
| 357 offset = kAdjustOffsetCases[i].input_offset; |
| 358 CodepageToWideAndAdjustOffset(kAdjustOffsetCases[i].encoded, |
| 359 kAdjustOffsetCases[i].codepage_name, |
| 360 OnStringConversionError::FAIL, &wide, &offset); |
| 361 #if defined(WCHAR_T_IS_UTF16) |
| 362 EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); |
| 363 #elif defined(WCHAR_T_IS_UTF32) |
| 364 EXPECT_EQ(kAdjustOffsetCases[i].wide_output_offset, offset); |
| 365 #endif |
| 366 } |
| 367 } |
| 368 |
328 } // namespace base | 369 } // namespace base |
OLD | NEW |