| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include <math.h> | |
| 6 #include <stdarg.h> | |
| 7 | |
| 8 #include <limits> | |
| 9 #include <sstream> | |
| 10 | |
| 11 #include "base/basictypes.h" | |
| 12 #include "base/format_macros.h" | |
| 13 #include "base/i18n/icu_string_conversions.h" | |
| 14 #include "base/logging.h" | |
| 15 #include "base/strings/string_piece.h" | |
| 16 #include "base/strings/stringprintf.h" | |
| 17 #include "base/strings/utf_string_conversions.h" | |
| 18 #include "testing/gtest/include/gtest/gtest.h" | |
| 19 | |
| 20 namespace base { | |
| 21 | |
| 22 namespace { | |
| 23 | |
| 24 // Given a null-terminated string of wchar_t with each wchar_t representing | |
| 25 // a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. | |
| 26 // Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) | |
| 27 // should be represented as a surrogate pair (two UTF-16 units) | |
| 28 // *even* where wchar_t is 32-bit (Linux and Mac). | |
| 29 // | |
| 30 // This is to help write tests for functions with string16 params until | |
| 31 // the C++ 0x UTF-16 literal is well-supported by compilers. | |
| 32 string16 BuildString16(const wchar_t* s) { | |
| 33 #if defined(WCHAR_T_IS_UTF16) | |
| 34 return string16(s); | |
| 35 #elif defined(WCHAR_T_IS_UTF32) | |
| 36 string16 u16; | |
| 37 while (*s != 0) { | |
| 38 DCHECK_LE(static_cast<unsigned int>(*s), 0xFFFFu); | |
| 39 u16.push_back(*s++); | |
| 40 } | |
| 41 return u16; | |
| 42 #endif | |
| 43 } | |
| 44 | |
| 45 } // namespace | |
| 46 | |
| 47 // kConverterCodepageCases is not comprehensive. There are a number of cases | |
| 48 // to add if we really want to have a comprehensive coverage of various | |
| 49 // codepages and their 'idiosyncrasies'. Currently, the only implementation | |
| 50 // for CodepageTo* and *ToCodepage uses ICU, which has a very extensive | |
| 51 // set of tests for the charset conversion. So, we can get away with a | |
| 52 // relatively small number of cases listed below. | |
| 53 // | |
| 54 // Note about |u16_wide| in the following struct. | |
| 55 // On Windows, the field is always identical to |wide|. On Mac and Linux, | |
| 56 // it's identical as long as there's no character outside the | |
| 57 // BMP (<= U+FFFF). When there is, it is different from |wide| and | |
| 58 // is not a real wide string (UTF-32 string) in that each wchar_t in | |
| 59 // the string is a UTF-16 code unit zero-extended to be 32-bit | |
| 60 // even when the code unit belongs to a surrogate pair. | |
| 61 // For instance, a Unicode string (U+0041 U+010000) is represented as | |
| 62 // L"\x0041\xD800\xDC00" instead of L"\x0041\x10000". | |
| 63 // To avoid the clutter, |u16_wide| will be set to NULL | |
| 64 // if it's identical to |wide| on *all* platforms. | |
| 65 | |
| 66 static const struct { | |
| 67 const char* codepage_name; | |
| 68 const char* encoded; | |
| 69 OnStringConversionError::Type on_error; | |
| 70 bool success; | |
| 71 const wchar_t* wide; | |
| 72 const wchar_t* u16_wide; | |
| 73 } kConvertCodepageCases[] = { | |
| 74 // Test a case where the input cannot be decoded, using SKIP, FAIL | |
| 75 // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't. | |
| 76 {"big5", | |
| 77 "\xA7\x41\xA6", | |
| 78 OnStringConversionError::FAIL, | |
| 79 false, | |
| 80 L"", | |
| 81 NULL}, | |
| 82 {"big5", | |
| 83 "\xA7\x41\xA6", | |
| 84 OnStringConversionError::SKIP, | |
| 85 true, | |
| 86 L"\x4F60", | |
| 87 NULL}, | |
| 88 {"big5", | |
| 89 "\xA7\x41\xA6", | |
| 90 OnStringConversionError::SUBSTITUTE, | |
| 91 true, | |
| 92 L"\x4F60\xFFFD", | |
| 93 NULL}, | |
| 94 // Arabic (ISO-8859) | |
| 95 {"iso-8859-6", | |
| 96 "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " " | |
| 97 "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2", | |
| 98 OnStringConversionError::FAIL, | |
| 99 true, | |
| 100 L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" " | |
| 101 L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652", | |
| 102 NULL}, | |
| 103 // Chinese Simplified (GB2312) | |
| 104 {"gb2312", | |
| 105 "\xC4\xE3\xBA\xC3", | |
| 106 OnStringConversionError::FAIL, | |
| 107 true, | |
| 108 L"\x4F60\x597D", | |
| 109 NULL}, | |
| 110 // Chinese (GB18030) : 4 byte sequences mapped to BMP characters | |
| 111 {"gb18030", | |
| 112 "\x81\x30\x84\x36\xA1\xA7", | |
| 113 OnStringConversionError::FAIL, | |
| 114 true, | |
| 115 L"\x00A5\x00A8", | |
| 116 NULL}, | |
| 117 // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000) | |
| 118 {"gb18030", | |
| 119 "\x95\x32\x82\x36\xD2\xBB", | |
| 120 OnStringConversionError::FAIL, | |
| 121 true, | |
| 122 #if defined(WCHAR_T_IS_UTF16) | |
| 123 L"\xD840\xDC00\x4E00", | |
| 124 #elif defined(WCHAR_T_IS_UTF32) | |
| 125 L"\x20000\x4E00", | |
| 126 #endif | |
| 127 L"\xD840\xDC00\x4E00"}, | |
| 128 {"big5", | |
| 129 "\xA7\x41\xA6\x6E", | |
| 130 OnStringConversionError::FAIL, | |
| 131 true, | |
| 132 L"\x4F60\x597D", | |
| 133 NULL}, | |
| 134 // Greek (ISO-8859) | |
| 135 {"iso-8859-7", | |
| 136 "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5", | |
| 137 OnStringConversionError::FAIL, | |
| 138 true, | |
| 139 L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5", | |
| 140 NULL}, | |
| 141 // Hebrew (Windows) | |
| 142 {"windows-1255", | |
| 143 "\xF9\xD1\xC8\xEC\xE5\xC9\xED", | |
| 144 OnStringConversionError::FAIL, | |
| 145 true, | |
| 146 L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD", | |
| 147 NULL}, | |
| 148 // Korean (EUC) | |
| 149 {"euc-kr", | |
| 150 "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4", | |
| 151 OnStringConversionError::FAIL, | |
| 152 true, | |
| 153 L"\xC548\xB155\xD558\xC138\xC694", | |
| 154 NULL}, | |
| 155 // Japanese (EUC) | |
| 156 {"euc-jp", | |
| 157 "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8E\xA6", | |
| 158 OnStringConversionError::FAIL, | |
| 159 true, | |
| 160 L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66", | |
| 161 NULL}, | |
| 162 // Japanese (ISO-2022) | |
| 163 {"iso-2022-jp", | |
| 164 "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B" | |
| 165 "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B", | |
| 166 OnStringConversionError::FAIL, | |
| 167 true, | |
| 168 L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$", | |
| 169 NULL}, | |
| 170 // Japanese (Shift-JIS) | |
| 171 {"sjis", | |
| 172 "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6", | |
| 173 OnStringConversionError::FAIL, | |
| 174 true, | |
| 175 L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66", | |
| 176 NULL}, | |
| 177 // Russian (KOI8) | |
| 178 {"koi8-r", | |
| 179 "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5", | |
| 180 OnStringConversionError::FAIL, | |
| 181 true, | |
| 182 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" | |
| 183 L"\x0443\x0439\x0442\x0435", | |
| 184 NULL}, | |
| 185 // Thai (windows-874) | |
| 186 {"windows-874", | |
| 187 "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", | |
| 188 OnStringConversionError::FAIL, | |
| 189 true, | |
| 190 L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" | |
| 191 L"\x0E04\x0E23\x0e31\x0E1A", | |
| 192 NULL}, | |
| 193 }; | |
| 194 | |
| 195 TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { | |
| 196 for (size_t i = 0; i < arraysize(kConvertCodepageCases); ++i) { | |
| 197 SCOPED_TRACE(base::StringPrintf( | |
| 198 "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, | |
| 199 kConvertCodepageCases[i].encoded, | |
| 200 kConvertCodepageCases[i].codepage_name)); | |
| 201 | |
| 202 string16 utf16; | |
| 203 bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, | |
| 204 kConvertCodepageCases[i].codepage_name, | |
| 205 kConvertCodepageCases[i].on_error, | |
| 206 &utf16); | |
| 207 string16 utf16_expected; | |
| 208 if (kConvertCodepageCases[i].u16_wide == NULL) | |
| 209 utf16_expected = BuildString16(kConvertCodepageCases[i].wide); | |
| 210 else | |
| 211 utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide); | |
| 212 EXPECT_EQ(kConvertCodepageCases[i].success, success); | |
| 213 EXPECT_EQ(utf16_expected, utf16); | |
| 214 | |
| 215 // When decoding was successful and nothing was skipped, we also check the | |
| 216 // reverse conversion. See also the corresponding comment in | |
| 217 // ConvertBetweenCodepageAndWide. | |
| 218 if (success && | |
| 219 kConvertCodepageCases[i].on_error == OnStringConversionError::FAIL) { | |
| 220 std::string encoded; | |
| 221 success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name, | |
| 222 kConvertCodepageCases[i].on_error, &encoded); | |
| 223 EXPECT_EQ(kConvertCodepageCases[i].success, success); | |
| 224 EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); | |
| 225 } | |
| 226 } | |
| 227 } | |
| 228 | |
// Cases for ConvertToUtf8AndNormalize().
//   encoded          - input bytes, encoded in |codepage_name|.
//   codepage_name    - name of the codepage to decode |encoded| with.
//   expected_success - expected return value.
//   expected_value   - expected contents of the output string (UTF-8 bytes).
static const struct {
  const char* encoded;
  const char* codepage_name;
  bool expected_success;
  const char* expected_value;
} kConvertAndNormalizeCases[] = {
    // The same byte decoded through two different codepages.
    {"foo-\xe4.html", "iso-8859-1", true, "foo-\xc3\xa4.html"},
    {"foo-\xe4.html", "iso-8859-7", true, "foo-\xce\xb4.html"},

    // A codepage name that is not expected to resolve: failure, empty output.
    {"foo-\xe4.html", "foo-bar", false, ""},

    // HTML Encoding spec treats US-ASCII as synonymous with windows-1252
    {"foo-\xff.html", "ascii", true, "foo-\xc3\xbf.html"},
    {"foo.html", "ascii", true, "foo.html"},

    // UTF-8 input holding 'a' followed by U+0308; the expected output is the
    // single code point U+00E4.
    {"foo-a\xcc\x88.html", "utf-8", true, "foo-\xc3\xa4.html"},

    // Multi-byte codepages, including a character outside the BMP.
    {"\x95\x32\x82\x36\xD2\xBB", "gb18030", true,
     "\xF0\xA0\x80\x80\xE4\xB8\x80"},
    {"\xA7\x41\xA6\x6E", "big5", true, "\xE4\xBD\xA0\xE5\xA5\xBD"},

    // Windows-1258 does have a combining character at xD2 (which is U+0309).
    // The sequence of (U+00E2, U+0309) is also encoded as U+1EA9.
    {"foo\xE2\xD2", "windows-1258", true, "foo\xE1\xBA\xA9"},

    // Empty input.
    {"", "iso-8859-1", true, ""},
};
| 249 TEST(ICUStringConversionsTest, ConvertToUtf8AndNormalize) { | |
| 250 std::string result; | |
| 251 for (size_t i = 0; i < arraysize(kConvertAndNormalizeCases); ++i) { | |
| 252 SCOPED_TRACE(base::StringPrintf( | |
| 253 "Test[%" PRIuS "]: <encoded: %s> <codepage: %s>", i, | |
| 254 kConvertAndNormalizeCases[i].encoded, | |
| 255 kConvertAndNormalizeCases[i].codepage_name)); | |
| 256 | |
| 257 bool success = ConvertToUtf8AndNormalize( | |
| 258 kConvertAndNormalizeCases[i].encoded, | |
| 259 kConvertAndNormalizeCases[i].codepage_name, &result); | |
| 260 EXPECT_EQ(kConvertAndNormalizeCases[i].expected_success, success); | |
| 261 EXPECT_EQ(kConvertAndNormalizeCases[i].expected_value, result); | |
| 262 } | |
| 263 } | |
| 264 | |
| 265 } // namespace base | |
| OLD | NEW |