| OLD | NEW |
| 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/i18n/string_conversions.h" | 5 #include "base/i18n/icu_string_conversions.h" |
| 6 | 6 |
| 7 #include <vector> | 7 #include <vector> |
| 8 | 8 |
| 9 #include "base/basictypes.h" | 9 #include "base/basictypes.h" |
| 10 #include "base/logging.h" | 10 #include "base/logging.h" |
| 11 #include "base/string_util.h" | 11 #include "base/string_util.h" |
| 12 #include "unicode/ucnv.h" | 12 #include "unicode/ucnv.h" |
| 13 #include "unicode/ucnv_cb.h" | 13 #include "unicode/ucnv_cb.h" |
| 14 #include "unicode/ucnv_err.h" | 14 #include "unicode/ucnv_err.h" |
| 15 #include "unicode/ustring.h" | 15 #include "unicode/ustring.h" |
| (...skipping 59 matching lines...) | (...skipping 59 matching lines...) |
| 75 (*(reinterpret_cast<const char*>(context)) == 'i' && | 75 (*(reinterpret_cast<const char*>(context)) == 'i' && |
| 76 reason == UCNV_UNASSIGNED)) { | 76 reason == UCNV_UNASSIGNED)) { |
| 77 *err = U_ZERO_ERROR; | 77 *err = U_ZERO_ERROR; |
| 78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); | 78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); |
| 79 } | 79 } |
| 80 // else the caller must have set the error code accordingly. | 80 // else the caller must have set the error code accordingly. |
| 81 } | 81 } |
| 82 // else ignore the reset, close and clone calls. | 82 // else ignore the reset, close and clone calls. |
| 83 } | 83 } |
| 84 | 84 |
| 85 // ReadUnicodeCharacter -------------------------------------------------------- | |
| 86 | |
| 87 // Reads a UTF-8 stream, placing the next code point into the given output | |
| 88 // |*code_point|. |src| represents the entire string to read, and |*char_index| | |
| 89 // is the character offset within the string to start reading at. |*char_index| | |
| 90 // will be updated to index the last character read, such that incrementing it | |
| 91 // (as in a for loop) will take the reader to the next character. | |
| 92 // | |
| 93 // Returns true on success. On false, |*code_point| will be invalid. | |
| 94 bool ReadUnicodeCharacter(const char* src, int32 src_len, | |
| 95 int32* char_index, uint32* code_point_out) { | |
| 96 // U8_NEXT expects to be able to use -1 to signal an error, so we must | |
| 97 // use a signed type for code_point. But this function returns false | |
| 98 // on error anyway, so code_point_out is unsigned. | |
| 99 int32 code_point; | |
| 100 U8_NEXT(src, *char_index, src_len, code_point); | |
| 101 *code_point_out = static_cast<uint32>(code_point); | |
| 102 | |
| 103 // The ICU macro above moves to the next char, we want to point to the last | |
| 104 // char consumed. | |
| 105 (*char_index)--; | |
| 106 | |
| 107 // Validate the decoded value. | |
| 108 return IsValidCodepoint(code_point); | |
| 109 } | |
| 110 | |
| 111 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. | |
| 112 bool ReadUnicodeCharacter(const char16* src, int32 src_len, | |
| 113 int32* char_index, uint32* code_point) { | |
| 114 if (U16_IS_SURROGATE(src[*char_index])) { | |
| 115 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || | |
| 116 *char_index + 1 >= src_len || | |
| 117 !U16_IS_TRAIL(src[*char_index + 1])) { | |
| 118 // Invalid surrogate pair. | |
| 119 return false; | |
| 120 } | |
| 121 | |
| 122 // Valid surrogate pair. | |
| 123 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], | |
| 124 src[*char_index + 1]); | |
| 125 (*char_index)++; | |
| 126 } else { | |
| 127 // Not a surrogate, just one 16-bit word. | |
| 128 *code_point = src[*char_index]; | |
| 129 } | |
| 130 | |
| 131 return IsValidCodepoint(*code_point); | |
| 132 } | |
| 133 | |
| 134 #if defined(WCHAR_T_IS_UTF32) | |
| 135 // Reads UTF-32 character. The usage is the same as the 8-bit version above. | |
| 136 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, | |
| 137 int32* char_index, uint32* code_point) { | |
| 138 // Conversion is easy since the source is 32-bit. | |
| 139 *code_point = src[*char_index]; | |
| 140 | |
| 141 // Validate the value. | |
| 142 return IsValidCodepoint(*code_point); | |
| 143 } | |
| 144 #endif // defined(WCHAR_T_IS_UTF32) | |
| 145 | |
| 146 // WriteUnicodeCharacter ------------------------------------------------------- | |
| 147 | |
| 148 // Appends a UTF-8 character to the given 8-bit string. | |
| 149 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { | |
| 150 if (code_point <= 0x7f) { | |
| 151 // Fast path the common case of one byte. | |
| 152 output->push_back(code_point); | |
| 153 return; | |
| 154 } | |
| 155 | |
| 156 // U8_APPEND_UNSAFE can append up to 4 bytes. | |
| 157 int32 char_offset = static_cast<int32>(output->length()); | |
| 158 output->resize(char_offset + U8_MAX_LENGTH); | |
| 159 | |
| 160 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
| 161 | |
| 162 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so | |
| 163 // it will represent the new length of the string. | |
| 164 output->resize(char_offset); | |
| 165 } | |
| 166 | |
| 167 // Appends the given code point as a UTF-16 character to the STL string. | |
| 168 void WriteUnicodeCharacter(uint32 code_point, string16* output) { | |
| 169 if (U16_LENGTH(code_point) == 1) { | |
| 170 // This code point is in the Basic Multilingual Plane (BMP). | |
| 171 output->push_back(static_cast<char16>(code_point)); | |
| 172 } else { | |
| 173 // Non-BMP characters use a double-character encoding. | |
| 174 int32 char_offset = static_cast<int32>(output->length()); | |
| 175 output->resize(char_offset + U16_MAX_LENGTH); | |
| 176 U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
| 177 } | |
| 178 } | |
| 179 | |
| 180 #if defined(WCHAR_T_IS_UTF32) | |
| 181 // Appends the given UTF-32 character to the given 32-bit string. | |
| 182 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { | |
| 183 // This is the easy case, just append the character. | |
| 184 output->push_back(code_point); | |
| 185 } | |
| 186 #endif // defined(WCHAR_T_IS_UTF32) | |
| 187 | |
| 188 // Generalized Unicode converter ----------------------------------------------- | |
| 189 | |
| 190 // Converts the given source Unicode character type to the given destination | |
| 191 // Unicode character type as a STL string. The given input buffer and size | |
| 192 // determine the source, and the given output STL string will be replaced by | |
| 193 // the result. | |
| 194 template<typename SRC_CHAR, typename DEST_STRING> | |
| 195 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { | |
| 196 output->clear(); | |
| 197 | |
| 198 // ICU requires 32-bit numbers. | |
| 199 bool success = true; | |
| 200 int32 src_len32 = static_cast<int32>(src_len); | |
| 201 for (int32 i = 0; i < src_len32; i++) { | |
| 202 uint32 code_point; | |
| 203 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | |
| 204 WriteUnicodeCharacter(code_point, output); | |
| 205 } else { | |
| 206 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) | |
| 207 // in place of an invalid codepoint. | |
| 208 success = false; | |
| 209 } | |
| 210 } | |
| 211 return success; | |
| 212 } | |
| 213 | |
| 214 | |
| 215 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount | |
| 216 // of space in the given string. We also assume that the input character types | |
| 217 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume | |
| 218 // the string length is greater than zero. | |
| 219 template<typename CHAR> | |
| 220 void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { | |
| 221 if (src[0] < 0x80) { | |
| 222 // Assume that the entire input will be ASCII. | |
| 223 output->reserve(src_len); | |
| 224 } else { | |
| 225 // Assume that the entire input is non-ASCII and will have 3 bytes per char. | |
| 226 output->reserve(src_len * 3); | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 // Guesses the size of the output buffer (containing either UTF-16 or -32 data) | |
| 231 // given some UTF-8 input that will be converted to it. See ReserveUTF8Output. | |
| 232 // We assume the source length is > 0. | |
| 233 template<typename STRING> | |
| 234 void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { | |
| 235 if (static_cast<unsigned char>(src[0]) < 0x80) { | |
| 236 // Assume the input is all ASCII, which means 1:1 correspondence. | |
| 237 output->reserve(src_len); | |
| 238 } else { | |
| 239 // Otherwise assume that the UTF-8 sequences will have 2 bytes for each | |
| 240 // character. | |
| 241 output->reserve(src_len / 2); | |
| 242 } | |
| 243 } | |
| 244 | |
| 245 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, | 85 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, |
| 246 int uchar_len, OnStringUtilConversionError::Type on_error, | 86 int uchar_len, OnStringUtilConversionError::Type on_error, |
| 247 std::string* encoded) { | 87 std::string* encoded) { |
| 248 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, | 88 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, |
| 249 ucnv_getMaxCharSize(converter)); | 89 ucnv_getMaxCharSize(converter)); |
| 250 encoded->resize(encoded_max_length); | 90 encoded->resize(encoded_max_length); |
| 251 | 91 |
| 252 UErrorCode status = U_ZERO_ERROR; | 92 UErrorCode status = U_ZERO_ERROR; |
| 253 | 93 |
| 254 // Setup our error handler. | 94 // Setup our error handler. |
| (...skipping 46 matching lines...) | (...skipping 46 matching lines...) |
| 301 inline UConverterType utf32_platform_endian() { | 141 inline UConverterType utf32_platform_endian() { |
| 302 #if U_IS_BIG_ENDIAN | 142 #if U_IS_BIG_ENDIAN |
| 303 return UCNV_UTF32_BigEndian; | 143 return UCNV_UTF32_BigEndian; |
| 304 #else | 144 #else |
| 305 return UCNV_UTF32_LittleEndian; | 145 return UCNV_UTF32_LittleEndian; |
| 306 #endif | 146 #endif |
| 307 } | 147 } |
| 308 | 148 |
| 309 } // namespace | 149 } // namespace |
| 310 | 150 |
| 311 // UTF-8 <-> Wide -------------------------------------------------------------- | |
| 312 | |
| 313 std::string WideToUTF8(const std::wstring& wide) { | |
| 314 std::string ret; | |
| 315 if (wide.empty()) | |
| 316 return ret; | |
| 317 | |
| 318 // Ignore the success flag of this call, it will do the best it can for | |
| 319 // invalid input, which is what we want here. | |
| 320 WideToUTF8(wide.data(), wide.length(), &ret); | |
| 321 return ret; | |
| 322 } | |
| 323 | |
| 324 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { | |
| 325 if (src_len == 0) { | |
| 326 output->clear(); | |
| 327 return true; | |
| 328 } | |
| 329 | |
| 330 ReserveUTF8Output(src, src_len, output); | |
| 331 return ConvertUnicode<wchar_t, std::string>(src, src_len, output); | |
| 332 } | |
| 333 | |
| 334 std::wstring UTF8ToWide(const base::StringPiece& utf8) { | |
| 335 std::wstring ret; | |
| 336 if (utf8.empty()) | |
| 337 return ret; | |
| 338 | |
| 339 UTF8ToWide(utf8.data(), utf8.length(), &ret); | |
| 340 return ret; | |
| 341 } | |
| 342 | |
| 343 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { | |
| 344 if (src_len == 0) { | |
| 345 output->clear(); | |
| 346 return true; | |
| 347 } | |
| 348 | |
| 349 ReserveUTF16Or32Output(src, src_len, output); | |
| 350 return ConvertUnicode<char, std::wstring>(src, src_len, output); | |
| 351 } | |
| 352 | |
| 353 // UTF-16 <-> Wide ------------------------------------------------------------- | |
| 354 | |
| 355 #if defined(WCHAR_T_IS_UTF16) | |
| 356 | |
| 357 // When wide == UTF-16, then conversions are a NOP. | |
| 358 string16 WideToUTF16(const std::wstring& wide) { | |
| 359 return wide; | |
| 360 } | |
| 361 | |
| 362 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
| 363 output->assign(src, src_len); | |
| 364 return true; | |
| 365 } | |
| 366 | |
| 367 std::wstring UTF16ToWide(const string16& utf16) { | |
| 368 return utf16; | |
| 369 } | |
| 370 | |
| 371 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
| 372 output->assign(src, src_len); | |
| 373 return true; | |
| 374 } | |
| 375 | |
| 376 #elif defined(WCHAR_T_IS_UTF32) | |
| 377 | |
| 378 string16 WideToUTF16(const std::wstring& wide) { | |
| 379 string16 ret; | |
| 380 if (wide.empty()) | |
| 381 return ret; | |
| 382 | |
| 383 WideToUTF16(wide.data(), wide.length(), &ret); | |
| 384 return ret; | |
| 385 } | |
| 386 | |
| 387 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
| 388 if (src_len == 0) { | |
| 389 output->clear(); | |
| 390 return true; | |
| 391 } | |
| 392 | |
| 393 // Assume that normally we won't have any non-BMP characters so the counts | |
| 394 // will be the same. | |
| 395 output->reserve(src_len); | |
| 396 return ConvertUnicode<wchar_t, string16>(src, src_len, output); | |
| 397 } | |
| 398 | |
| 399 std::wstring UTF16ToWide(const string16& utf16) { | |
| 400 std::wstring ret; | |
| 401 if (utf16.empty()) | |
| 402 return ret; | |
| 403 | |
| 404 UTF16ToWide(utf16.data(), utf16.length(), &ret); | |
| 405 return ret; | |
| 406 } | |
| 407 | |
| 408 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
| 409 if (src_len == 0) { | |
| 410 output->clear(); | |
| 411 return true; | |
| 412 } | |
| 413 | |
| 414 // Assume that normally we won't have any non-BMP characters so the counts | |
| 415 // will be the same. | |
| 416 output->reserve(src_len); | |
| 417 return ConvertUnicode<char16, std::wstring>(src, src_len, output); | |
| 418 } | |
| 419 | |
| 420 #endif // defined(WCHAR_T_IS_UTF32) | |
| 421 | |
| 422 // UTF16 <-> UTF8 -------------------------------------------------------------- | |
| 423 | |
| 424 #if defined(WCHAR_T_IS_UTF32) | |
| 425 | |
| 426 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
| 427 if (src_len == 0) { | |
| 428 output->clear(); | |
| 429 return true; | |
| 430 } | |
| 431 | |
| 432 ReserveUTF16Or32Output(src, src_len, output); | |
| 433 return ConvertUnicode<char, string16>(src, src_len, output); | |
| 434 } | |
| 435 | |
| 436 string16 UTF8ToUTF16(const std::string& utf8) { | |
| 437 string16 ret; | |
| 438 if (utf8.empty()) | |
| 439 return ret; | |
| 440 | |
| 441 // Ignore the success flag of this call, it will do the best it can for | |
| 442 // invalid input, which is what we want here. | |
| 443 UTF8ToUTF16(utf8.data(), utf8.length(), &ret); | |
| 444 return ret; | |
| 445 } | |
| 446 | |
| 447 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
| 448 if (src_len == 0) { | |
| 449 output->clear(); | |
| 450 return true; | |
| 451 } | |
| 452 | |
| 453 ReserveUTF8Output(src, src_len, output); | |
| 454 return ConvertUnicode<char16, std::string>(src, src_len, output); | |
| 455 } | |
| 456 | |
| 457 std::string UTF16ToUTF8(const string16& utf16) { | |
| 458 std::string ret; | |
| 459 if (utf16.empty()) | |
| 460 return ret; | |
| 461 | |
| 462 // Ignore the success flag of this call, it will do the best it can for | |
| 463 // invalid input, which is what we want here. | |
| 464 UTF16ToUTF8(utf16.data(), utf16.length(), &ret); | |
| 465 return ret; | |
| 466 } | |
| 467 | |
| 468 #elif defined(WCHAR_T_IS_UTF16) | |
| 469 // Easy case since we can use the "wide" versions we already wrote above. | |
| 470 | |
| 471 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
| 472 return UTF8ToWide(src, src_len, output); | |
| 473 } | |
| 474 | |
| 475 string16 UTF8ToUTF16(const std::string& utf8) { | |
| 476 return UTF8ToWide(utf8); | |
| 477 } | |
| 478 | |
| 479 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
| 480 return WideToUTF8(src, src_len, output); | |
| 481 } | |
| 482 | |
| 483 std::string UTF16ToUTF8(const string16& utf16) { | |
| 484 return WideToUTF8(utf16); | |
| 485 } | |
| 486 | |
| 487 #endif | |
| 488 | |
| 489 // Codepage <-> Wide/UTF-16 --------------------------------------------------- | 151 // Codepage <-> Wide/UTF-16 --------------------------------------------------- |
| 490 | 152 |
| 491 // Convert a wstring into the specified codepage_name. If the codepage | 153 // Convert a wstring into the specified codepage_name. If the codepage |
| 492 // isn't found, return false. | 154 // isn't found, return false. |
| 493 bool WideToCodepage(const std::wstring& wide, | 155 bool WideToCodepage(const std::wstring& wide, |
| 494 const char* codepage_name, | 156 const char* codepage_name, |
| 495 OnStringUtilConversionError::Type on_error, | 157 OnStringUtilConversionError::Type on_error, |
| 496 std::string* encoded) { | 158 std::string* encoded) { |
| 497 #if defined(WCHAR_T_IS_UTF16) | 159 #if defined(WCHAR_T_IS_UTF16) |
| 498 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); | 160 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); |
| (...skipping 118 matching lines...) | (...skipping 118 matching lines...) |
| 617 ucnv_close(converter); | 279 ucnv_close(converter); |
| 618 if (!U_SUCCESS(status)) { | 280 if (!U_SUCCESS(status)) { |
| 619 utf16->clear(); // Make sure the output is empty on error. | 281 utf16->clear(); // Make sure the output is empty on error. |
| 620 return false; | 282 return false; |
| 621 } | 283 } |
| 622 | 284 |
| 623 utf16->resize(actual_size); | 285 utf16->resize(actual_size); |
| 624 return true; | 286 return true; |
| 625 } | 287 } |
| 626 | 288 |
| OLD | NEW |