| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "base/i18n/string_conversions.h" | |
| 6 | |
| 7 #include <vector> | |
| 8 | |
| 9 #include "base/basictypes.h" | |
| 10 #include "base/logging.h" | |
| 11 #include "base/string_util.h" | |
| 12 #include "unicode/ucnv.h" | |
| 13 #include "unicode/ucnv_cb.h" | |
| 14 #include "unicode/ucnv_err.h" | |
| 15 #include "unicode/ustring.h" | |
| 16 | |
| 17 namespace { | |
| 18 | |
| 19 inline bool IsValidCodepoint(uint32 code_point) { | |
| 20 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and | |
| 21 // codepoints larger than 0x10FFFF (the highest codepoint allowed). | |
| 22 // Non-characters and unassigned codepoints are allowed. | |
| 23 return code_point < 0xD800u || | |
| 24 (code_point >= 0xE000u && code_point <= 0x10FFFFu); | |
| 25 } | |
| 26 | |
| 27 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE | |
| 28 // in source/common/ucnv_err.c. | |
| 29 | |
| 30 // Copyright (c) 1995-2006 International Business Machines Corporation | |
| 31 // and others | |
| 32 // | |
| 33 // All rights reserved. | |
| 34 // | |
| 35 | |
| 36 // Permission is hereby granted, free of charge, to any person obtaining a | |
| 37 // copy of this software and associated documentation files (the "Software"), | |
| 38 // to deal in the Software without restriction, including without limitation | |
| 39 // the rights to use, copy, modify, merge, publish, distribute, and/or | |
| 40 // sell copies of the Software, and to permit persons to whom the Software | |
| 41 // is furnished to do so, provided that the above copyright notice(s) and | |
| 42 // this permission notice appear in all copies of the Software and that | |
| 43 // both the above copyright notice(s) and this permission notice appear in | |
| 44 // supporting documentation. | |
| 45 // | |
| 46 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
| 47 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
| 48 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT | |
| 49 // OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS | |
| 50 // INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT | |
| 51 // OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS | |
| 52 // OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE | |
| 53 // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE | |
| 54 // OR PERFORMANCE OF THIS SOFTWARE. | |
| 55 // | |
| 56 // Except as contained in this notice, the name of a copyright holder | |
| 57 // shall not be used in advertising or otherwise to promote the sale, use | |
| 58 // or other dealings in this Software without prior written authorization | |
| 59 // of the copyright holder. | |
| 60 | |
| 61 // ___________________________________________________________________________ | |
| 62 // | |
| 63 // All trademarks and registered trademarks mentioned herein are the property | |
| 64 // of their respective owners. | |
| 65 | |
| 66 void ToUnicodeCallbackSubstitute(const void* context, | |
| 67 UConverterToUnicodeArgs *to_args, | |
| 68 const char* code_units, | |
| 69 int32_t length, | |
| 70 UConverterCallbackReason reason, | |
| 71 UErrorCode * err) { | |
| 72 static const UChar kReplacementChar = 0xFFFD; | |
| 73 if (reason <= UCNV_IRREGULAR) { | |
| 74 if (context == NULL || | |
| 75 (*(reinterpret_cast<const char*>(context)) == 'i' && | |
| 76 reason == UCNV_UNASSIGNED)) { | |
| 77 *err = U_ZERO_ERROR; | |
| 78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); | |
| 79 } | |
| 80 // else the caller must have set the error code accordingly. | |
| 81 } | |
| 82 // else ignore the reset, close and clone calls. | |
| 83 } | |
| 84 | |
| 85 // ReadUnicodeCharacter -------------------------------------------------------- | |
| 86 | |
| 87 // Reads a UTF-8 stream, placing the next code point into the given output | |
| 88 // |*code_point|. |src| represents the entire string to read, and |*char_index| | |
| 89 // is the character offset within the string to start reading at. |*char_index| | |
| 90 // will be updated to index the last character read, such that incrementing it | |
| 91 // (as in a for loop) will take the reader to the next character. | |
| 92 // | |
| 93 // Returns true on success. On false, |*code_point| will be invalid. | |
| 94 bool ReadUnicodeCharacter(const char* src, int32 src_len, | |
| 95 int32* char_index, uint32* code_point_out) { | |
| 96 // U8_NEXT expects to be able to use -1 to signal an error, so we must | |
| 97 // use a signed type for code_point. But this function returns false | |
| 98 // on error anyway, so code_point_out is unsigned. | |
| 99 int32 code_point; | |
| 100 U8_NEXT(src, *char_index, src_len, code_point); | |
| 101 *code_point_out = static_cast<uint32>(code_point); | |
| 102 | |
| 103 // The ICU macro above moves to the next char, we want to point to the last | |
| 104 // char consumed. | |
| 105 (*char_index)--; | |
| 106 | |
| 107 // Validate the decoded value. | |
| 108 return IsValidCodepoint(code_point); | |
| 109 } | |
| 110 | |
| 111 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. | |
| 112 bool ReadUnicodeCharacter(const char16* src, int32 src_len, | |
| 113 int32* char_index, uint32* code_point) { | |
| 114 if (U16_IS_SURROGATE(src[*char_index])) { | |
| 115 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || | |
| 116 *char_index + 1 >= src_len || | |
| 117 !U16_IS_TRAIL(src[*char_index + 1])) { | |
| 118 // Invalid surrogate pair. | |
| 119 return false; | |
| 120 } | |
| 121 | |
| 122 // Valid surrogate pair. | |
| 123 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], | |
| 124 src[*char_index + 1]); | |
| 125 (*char_index)++; | |
| 126 } else { | |
| 127 // Not a surrogate, just one 16-bit word. | |
| 128 *code_point = src[*char_index]; | |
| 129 } | |
| 130 | |
| 131 return IsValidCodepoint(*code_point); | |
| 132 } | |
| 133 | |
| 134 #if defined(WCHAR_T_IS_UTF32) | |
| 135 // Reads UTF-32 character. The usage is the same as the 8-bit version above. | |
| 136 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, | |
| 137 int32* char_index, uint32* code_point) { | |
| 138 // Conversion is easy since the source is 32-bit. | |
| 139 *code_point = src[*char_index]; | |
| 140 | |
| 141 // Validate the value. | |
| 142 return IsValidCodepoint(*code_point); | |
| 143 } | |
| 144 #endif // defined(WCHAR_T_IS_UTF32) | |
| 145 | |
| 146 // WriteUnicodeCharacter ------------------------------------------------------- | |
| 147 | |
| 148 // Appends a UTF-8 character to the given 8-bit string. | |
| 149 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { | |
| 150 if (code_point <= 0x7f) { | |
| 151 // Fast path the common case of one byte. | |
| 152 output->push_back(code_point); | |
| 153 return; | |
| 154 } | |
| 155 | |
| 156 // U8_APPEND_UNSAFE can append up to 4 bytes. | |
| 157 int32 char_offset = static_cast<int32>(output->length()); | |
| 158 output->resize(char_offset + U8_MAX_LENGTH); | |
| 159 | |
| 160 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
| 161 | |
| 162 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so | |
| 163 // it will represent the new length of the string. | |
| 164 output->resize(char_offset); | |
| 165 } | |
| 166 | |
| 167 // Appends the given code point as a UTF-16 character to the STL string. | |
| 168 void WriteUnicodeCharacter(uint32 code_point, string16* output) { | |
| 169 if (U16_LENGTH(code_point) == 1) { | |
| 170 // Thie code point is in the Basic Multilingual Plane (BMP). | |
| 171 output->push_back(static_cast<char16>(code_point)); | |
| 172 } else { | |
| 173 // Non-BMP characters use a double-character encoding. | |
| 174 int32 char_offset = static_cast<int32>(output->length()); | |
| 175 output->resize(char_offset + U16_MAX_LENGTH); | |
| 176 U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
| 177 } | |
| 178 } | |
| 179 | |
| 180 #if defined(WCHAR_T_IS_UTF32) | |
| 181 // Appends the given UTF-32 character to the given 32-bit string. | |
| 182 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { | |
| 183 // This is the easy case, just append the character. | |
| 184 output->push_back(code_point); | |
| 185 } | |
| 186 #endif // defined(WCHAR_T_IS_UTF32) | |
| 187 | |
| 188 // Generalized Unicode converter ----------------------------------------------- | |
| 189 | |
| 190 // Converts the given source Unicode character type to the given destination | |
| 191 // Unicode character type as a STL string. The given input buffer and size | |
| 192 // determine the source, and the given output STL string will be replaced by | |
| 193 // the result. | |
| 194 template<typename SRC_CHAR, typename DEST_STRING> | |
| 195 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { | |
| 196 output->clear(); | |
| 197 | |
| 198 // ICU requires 32-bit numbers. | |
| 199 bool success = true; | |
| 200 int32 src_len32 = static_cast<int32>(src_len); | |
| 201 for (int32 i = 0; i < src_len32; i++) { | |
| 202 uint32 code_point; | |
| 203 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | |
| 204 WriteUnicodeCharacter(code_point, output); | |
| 205 } else { | |
| 206 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) | |
| 207 // in place of an invalid codepoint. | |
| 208 success = false; | |
| 209 } | |
| 210 } | |
| 211 return success; | |
| 212 } | |
| 213 | |
| 214 | |
// Reserves space in |output| for a guess at the UTF-8 byte length of the
// input. Only the first character is inspected: if it is below 0x80 the
// whole input is assumed to be ASCII (one byte per character), otherwise
// three bytes per character are assumed. The input character type is assumed
// to be unsigned, which is expected for UTF-16 and -32 on our systems, and
// |src_len| must be greater than zero because src[0] is read.
template<typename CHAR>
void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
  const bool first_is_ascii = src[0] < 0x80;
  const size_t bytes_per_char = first_is_ascii ? 1 : 3;
  output->reserve(src_len * bytes_per_char);
}
| 229 | |
// Reserves space in |output| (a UTF-16 or UTF-32 string) for a guess at the
// number of characters the UTF-8 input will produce. Only the first byte is
// inspected: if it is ASCII a 1:1 correspondence is assumed, otherwise two
// input bytes per output character. |src_len| must be greater than zero
// because src[0] is read.
template<typename STRING>
void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
  const unsigned char first_byte = static_cast<unsigned char>(src[0]);
  const size_t estimated_chars = first_byte < 0x80 ? src_len : src_len / 2;
  output->reserve(estimated_chars);
}
| 244 | |
| 245 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, | |
| 246 int uchar_len, OnStringUtilConversionError::Type on_error, | |
| 247 std::string* encoded) { | |
| 248 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, | |
| 249 ucnv_getMaxCharSize(converter)); | |
| 250 encoded->resize(encoded_max_length); | |
| 251 | |
| 252 UErrorCode status = U_ZERO_ERROR; | |
| 253 | |
| 254 // Setup our error handler. | |
| 255 switch (on_error) { | |
| 256 case OnStringUtilConversionError::FAIL: | |
| 257 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, | |
| 258 NULL, NULL, &status); | |
| 259 break; | |
| 260 case OnStringUtilConversionError::SKIP: | |
| 261 case OnStringUtilConversionError::SUBSTITUTE: | |
| 262 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, | |
| 263 NULL, NULL, &status); | |
| 264 break; | |
| 265 default: | |
| 266 NOTREACHED(); | |
| 267 } | |
| 268 | |
| 269 // ucnv_fromUChars returns size not including terminating null | |
| 270 int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], | |
| 271 encoded_max_length, uchar_src, uchar_len, &status); | |
| 272 encoded->resize(actual_size); | |
| 273 ucnv_close(converter); | |
| 274 if (U_SUCCESS(status)) | |
| 275 return true; | |
| 276 encoded->clear(); // Make sure the output is empty on error. | |
| 277 return false; | |
| 278 } | |
| 279 | |
| 280 // Set up our error handler for ToUTF-16 converters | |
| 281 void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error, | |
| 282 UConverter* converter, UErrorCode* status) { | |
| 283 switch (on_error) { | |
| 284 case OnStringUtilConversionError::FAIL: | |
| 285 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, | |
| 286 NULL, NULL, status); | |
| 287 break; | |
| 288 case OnStringUtilConversionError::SKIP: | |
| 289 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, | |
| 290 NULL, NULL, status); | |
| 291 break; | |
| 292 case OnStringUtilConversionError::SUBSTITUTE: | |
| 293 ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, | |
| 294 NULL, NULL, status); | |
| 295 break; | |
| 296 default: | |
| 297 NOTREACHED(); | |
| 298 } | |
| 299 } | |
| 300 | |
| 301 inline UConverterType utf32_platform_endian() { | |
| 302 #if U_IS_BIG_ENDIAN | |
| 303 return UCNV_UTF32_BigEndian; | |
| 304 #else | |
| 305 return UCNV_UTF32_LittleEndian; | |
| 306 #endif | |
| 307 } | |
| 308 | |
| 309 } // namespace | |
| 310 | |
| 311 // UTF-8 <-> Wide -------------------------------------------------------------- | |
| 312 | |
| 313 std::string WideToUTF8(const std::wstring& wide) { | |
| 314 std::string ret; | |
| 315 if (wide.empty()) | |
| 316 return ret; | |
| 317 | |
| 318 // Ignore the success flag of this call, it will do the best it can for | |
| 319 // invalid input, which is what we want here. | |
| 320 WideToUTF8(wide.data(), wide.length(), &ret); | |
| 321 return ret; | |
| 322 } | |
| 323 | |
| 324 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { | |
| 325 if (src_len == 0) { | |
| 326 output->clear(); | |
| 327 return true; | |
| 328 } | |
| 329 | |
| 330 ReserveUTF8Output(src, src_len, output); | |
| 331 return ConvertUnicode<wchar_t, std::string>(src, src_len, output); | |
| 332 } | |
| 333 | |
| 334 std::wstring UTF8ToWide(const base::StringPiece& utf8) { | |
| 335 std::wstring ret; | |
| 336 if (utf8.empty()) | |
| 337 return ret; | |
| 338 | |
| 339 UTF8ToWide(utf8.data(), utf8.length(), &ret); | |
| 340 return ret; | |
| 341 } | |
| 342 | |
| 343 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { | |
| 344 if (src_len == 0) { | |
| 345 output->clear(); | |
| 346 return true; | |
| 347 } | |
| 348 | |
| 349 ReserveUTF16Or32Output(src, src_len, output); | |
| 350 return ConvertUnicode<char, std::wstring>(src, src_len, output); | |
| 351 } | |
| 352 | |
| 353 // UTF-16 <-> Wide ------------------------------------------------------------- | |
| 354 | |
| 355 #if defined(WCHAR_T_IS_UTF16) | |
| 356 | |
| 357 // When wide == UTF-16, then conversions are a NOP. | |
| 358 string16 WideToUTF16(const std::wstring& wide) { | |
| 359 return wide; | |
| 360 } | |
| 361 | |
| 362 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
| 363 output->assign(src, src_len); | |
| 364 return true; | |
| 365 } | |
| 366 | |
| 367 std::wstring UTF16ToWide(const string16& utf16) { | |
| 368 return utf16; | |
| 369 } | |
| 370 | |
| 371 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
| 372 output->assign(src, src_len); | |
| 373 return true; | |
| 374 } | |
| 375 | |
| 376 #elif defined(WCHAR_T_IS_UTF32) | |
| 377 | |
| 378 string16 WideToUTF16(const std::wstring& wide) { | |
| 379 string16 ret; | |
| 380 if (wide.empty()) | |
| 381 return ret; | |
| 382 | |
| 383 WideToUTF16(wide.data(), wide.length(), &ret); | |
| 384 return ret; | |
| 385 } | |
| 386 | |
| 387 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
| 388 if (src_len == 0) { | |
| 389 output->clear(); | |
| 390 return true; | |
| 391 } | |
| 392 | |
| 393 // Assume that normally we won't have any non-BMP characters so the counts | |
| 394 // will be the same. | |
| 395 output->reserve(src_len); | |
| 396 return ConvertUnicode<wchar_t, string16>(src, src_len, output); | |
| 397 } | |
| 398 | |
| 399 std::wstring UTF16ToWide(const string16& utf16) { | |
| 400 std::wstring ret; | |
| 401 if (utf16.empty()) | |
| 402 return ret; | |
| 403 | |
| 404 UTF16ToWide(utf16.data(), utf16.length(), &ret); | |
| 405 return ret; | |
| 406 } | |
| 407 | |
| 408 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
| 409 if (src_len == 0) { | |
| 410 output->clear(); | |
| 411 return true; | |
| 412 } | |
| 413 | |
| 414 // Assume that normally we won't have any non-BMP characters so the counts | |
| 415 // will be the same. | |
| 416 output->reserve(src_len); | |
| 417 return ConvertUnicode<char16, std::wstring>(src, src_len, output); | |
| 418 } | |
| 419 | |
| 420 #endif // defined(WCHAR_T_IS_UTF32) | |
| 421 | |
| 422 // UTF16 <-> UTF8 -------------------------------------------------------------- | |
| 423 | |
| 424 #if defined(WCHAR_T_IS_UTF32) | |
| 425 | |
| 426 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
| 427 if (src_len == 0) { | |
| 428 output->clear(); | |
| 429 return true; | |
| 430 } | |
| 431 | |
| 432 ReserveUTF16Or32Output(src, src_len, output); | |
| 433 return ConvertUnicode<char, string16>(src, src_len, output); | |
| 434 } | |
| 435 | |
| 436 string16 UTF8ToUTF16(const std::string& utf8) { | |
| 437 string16 ret; | |
| 438 if (utf8.empty()) | |
| 439 return ret; | |
| 440 | |
| 441 // Ignore the success flag of this call, it will do the best it can for | |
| 442 // invalid input, which is what we want here. | |
| 443 UTF8ToUTF16(utf8.data(), utf8.length(), &ret); | |
| 444 return ret; | |
| 445 } | |
| 446 | |
| 447 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
| 448 if (src_len == 0) { | |
| 449 output->clear(); | |
| 450 return true; | |
| 451 } | |
| 452 | |
| 453 ReserveUTF8Output(src, src_len, output); | |
| 454 return ConvertUnicode<char16, std::string>(src, src_len, output); | |
| 455 } | |
| 456 | |
| 457 std::string UTF16ToUTF8(const string16& utf16) { | |
| 458 std::string ret; | |
| 459 if (utf16.empty()) | |
| 460 return ret; | |
| 461 | |
| 462 // Ignore the success flag of this call, it will do the best it can for | |
| 463 // invalid input, which is what we want here. | |
| 464 UTF16ToUTF8(utf16.data(), utf16.length(), &ret); | |
| 465 return ret; | |
| 466 } | |
| 467 | |
| 468 #elif defined(WCHAR_T_IS_UTF16) | |
| 469 // Easy case since we can use the "wide" versions we already wrote above. | |
| 470 | |
| 471 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
| 472 return UTF8ToWide(src, src_len, output); | |
| 473 } | |
| 474 | |
| 475 string16 UTF8ToUTF16(const std::string& utf8) { | |
| 476 return UTF8ToWide(utf8); | |
| 477 } | |
| 478 | |
| 479 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
| 480 return WideToUTF8(src, src_len, output); | |
| 481 } | |
| 482 | |
| 483 std::string UTF16ToUTF8(const string16& utf16) { | |
| 484 return WideToUTF8(utf16); | |
| 485 } | |
| 486 | |
| 487 #endif | |
| 488 | |
| 489 // Codepage <-> Wide/UTF-16 --------------------------------------------------- | |
| 490 | |
| 491 // Convert a wstring into the specified codepage_name. If the codepage | |
| 492 // isn't found, return false. | |
| 493 bool WideToCodepage(const std::wstring& wide, | |
| 494 const char* codepage_name, | |
| 495 OnStringUtilConversionError::Type on_error, | |
| 496 std::string* encoded) { | |
| 497 #if defined(WCHAR_T_IS_UTF16) | |
| 498 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); | |
| 499 #elif defined(WCHAR_T_IS_UTF32) | |
| 500 encoded->clear(); | |
| 501 | |
| 502 UErrorCode status = U_ZERO_ERROR; | |
| 503 UConverter* converter = ucnv_open(codepage_name, &status); | |
| 504 if (!U_SUCCESS(status)) | |
| 505 return false; | |
| 506 | |
| 507 int utf16_len; | |
| 508 // When wchar_t is wider than UChar (16 bits), transform |wide| into a | |
| 509 // UChar* string. Size the UChar* buffer to be large enough to hold twice | |
| 510 // as many UTF-16 code units (UChar's) as there are Unicode code points, | |
| 511 // in case each code points translates to a UTF-16 surrogate pair, | |
| 512 // and leave room for a NUL terminator. | |
| 513 std::vector<UChar> utf16(wide.length() * 2 + 1); | |
| 514 u_strFromWCS(&utf16[0], utf16.size(), &utf16_len, | |
| 515 wide.c_str(), wide.length(), &status); | |
| 516 DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; | |
| 517 | |
| 518 return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded); | |
| 519 #endif // defined(WCHAR_T_IS_UTF32) | |
| 520 } | |
| 521 | |
| 522 // Convert a UTF-16 string into the specified codepage_name. If the codepage | |
| 523 // isn't found, return false. | |
| 524 bool UTF16ToCodepage(const string16& utf16, | |
| 525 const char* codepage_name, | |
| 526 OnStringUtilConversionError::Type on_error, | |
| 527 std::string* encoded) { | |
| 528 encoded->clear(); | |
| 529 | |
| 530 UErrorCode status = U_ZERO_ERROR; | |
| 531 UConverter* converter = ucnv_open(codepage_name, &status); | |
| 532 if (!U_SUCCESS(status)) | |
| 533 return false; | |
| 534 | |
| 535 return ConvertFromUTF16(converter, utf16.c_str(), | |
| 536 static_cast<int>(utf16.length()), on_error, encoded); | |
| 537 } | |
| 538 | |
| 539 // Converts a string of the given codepage into wstring. | |
| 540 // If the codepage isn't found, return false. | |
| 541 bool CodepageToWide(const std::string& encoded, | |
| 542 const char* codepage_name, | |
| 543 OnStringUtilConversionError::Type on_error, | |
| 544 std::wstring* wide) { | |
| 545 #if defined(WCHAR_T_IS_UTF16) | |
| 546 return CodepageToUTF16(encoded, codepage_name, on_error, wide); | |
| 547 #elif defined(WCHAR_T_IS_UTF32) | |
| 548 wide->clear(); | |
| 549 | |
| 550 UErrorCode status = U_ZERO_ERROR; | |
| 551 UConverter* converter = ucnv_open(codepage_name, &status); | |
| 552 if (!U_SUCCESS(status)) | |
| 553 return false; | |
| 554 | |
| 555 // The maximum length in 4 byte unit of UTF-32 output would be | |
| 556 // at most the same as the number of bytes in input. In the worst | |
| 557 // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP), | |
| 558 // this can be 4 times larger than actually needed. | |
| 559 size_t wchar_max_length = encoded.length() + 1; | |
| 560 | |
| 561 // The byte buffer and its length to pass to ucnv_toAlgorithimic. | |
| 562 char* byte_buffer = reinterpret_cast<char*>( | |
| 563 WriteInto(wide, wchar_max_length)); | |
| 564 int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; | |
| 565 | |
| 566 SetUpErrorHandlerForToUChars(on_error, converter, &status); | |
| 567 int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), | |
| 568 converter, | |
| 569 byte_buffer, | |
| 570 byte_buffer_length, | |
| 571 encoded.data(), | |
| 572 static_cast<int>(encoded.length()), | |
| 573 &status); | |
| 574 ucnv_close(converter); | |
| 575 | |
| 576 if (!U_SUCCESS(status)) { | |
| 577 wide->clear(); // Make sure the output is empty on error. | |
| 578 return false; | |
| 579 } | |
| 580 | |
| 581 // actual_size is # of bytes. | |
| 582 wide->resize(actual_size / 4); | |
| 583 return true; | |
| 584 #endif // defined(WCHAR_T_IS_UTF32) | |
| 585 } | |
| 586 | |
| 587 // Converts a string of the given codepage into UTF-16. | |
| 588 // If the codepage isn't found, return false. | |
| 589 bool CodepageToUTF16(const std::string& encoded, | |
| 590 const char* codepage_name, | |
| 591 OnStringUtilConversionError::Type on_error, | |
| 592 string16* utf16) { | |
| 593 utf16->clear(); | |
| 594 | |
| 595 UErrorCode status = U_ZERO_ERROR; | |
| 596 UConverter* converter = ucnv_open(codepage_name, &status); | |
| 597 if (!U_SUCCESS(status)) | |
| 598 return false; | |
| 599 | |
| 600 // Even in the worst case, the maximum length in 2-byte units of UTF-16 | |
| 601 // output would be at most the same as the number of bytes in input. There | |
| 602 // is no single-byte encoding in which a character is mapped to a | |
| 603 // non-BMP character requiring two 2-byte units. | |
| 604 // | |
| 605 // Moreover, non-BMP characters in legacy multibyte encodings | |
| 606 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are | |
| 607 // BOCU and SCSU, but we don't care about them. | |
| 608 size_t uchar_max_length = encoded.length() + 1; | |
| 609 | |
| 610 SetUpErrorHandlerForToUChars(on_error, converter, &status); | |
| 611 int actual_size = ucnv_toUChars(converter, | |
| 612 WriteInto(utf16, uchar_max_length), | |
| 613 static_cast<int>(uchar_max_length), | |
| 614 encoded.data(), | |
| 615 static_cast<int>(encoded.length()), | |
| 616 &status); | |
| 617 ucnv_close(converter); | |
| 618 if (!U_SUCCESS(status)) { | |
| 619 utf16->clear(); // Make sure the output is empty on error. | |
| 620 return false; | |
| 621 } | |
| 622 | |
| 623 utf16->resize(actual_size); | |
| 624 return true; | |
| 625 } | |
| 626 | |
| OLD | NEW |