OLD | NEW |
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/i18n/string_conversions.h" | 5 #include "base/utf_string_conversions.h" |
6 | 6 |
7 #include <vector> | 7 #include <vector> |
8 | 8 |
9 #include "base/basictypes.h" | 9 #include "base/basictypes.h" |
10 #include "base/logging.h" | 10 #include "base/logging.h" |
11 #include "base/string_util.h" | 11 #include "base/string_util.h" |
12 #include "unicode/ucnv.h" | 12 #include "base/third_party/icu/icu_utf.h" |
13 #include "unicode/ucnv_cb.h" | |
14 #include "unicode/ucnv_err.h" | |
15 #include "unicode/ustring.h" | |
16 | 13 |
17 namespace { | 14 namespace { |
18 | 15 |
19 inline bool IsValidCodepoint(uint32 code_point) { | 16 inline bool IsValidCodepoint(uint32 code_point) { |
20 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and | 17 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and |
21 // codepoints larger than 0x10FFFF (the highest codepoint allowed). | 18 // codepoints larger than 0x10FFFF (the highest codepoint allowed). |
22 // Non-characters and unassigned codepoints are allowed. | 19 // Non-characters and unassigned codepoints are allowed. |
23 return code_point < 0xD800u || | 20 return code_point < 0xD800u || |
24 (code_point >= 0xE000u && code_point <= 0x10FFFFu); | 21 (code_point >= 0xE000u && code_point <= 0x10FFFFu); |
25 } | 22 } |
26 | 23 |
27 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE | |
28 // in source/common/ucnv_err.c. | |
29 | |
30 // Copyright (c) 1995-2006 International Business Machines Corporation | |
31 // and others | |
32 // | |
33 // All rights reserved. | |
34 // | |
35 | |
36 // Permission is hereby granted, free of charge, to any person obtaining a | |
37 // copy of this software and associated documentation files (the "Software"), | |
38 // to deal in the Software without restriction, including without limitation | |
39 // the rights to use, copy, modify, merge, publish, distribute, and/or | |
40 // sell copies of the Software, and to permit persons to whom the Software | |
41 // is furnished to do so, provided that the above copyright notice(s) and | |
42 // this permission notice appear in all copies of the Software and that | |
43 // both the above copyright notice(s) and this permission notice appear in | |
44 // supporting documentation. | |
45 // | |
46 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
47 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
48 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT | |
49 // OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS | |
50 // INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT | |
51 // OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS | |
52 // OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE | |
53 // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE | |
54 // OR PERFORMANCE OF THIS SOFTWARE. | |
55 // | |
56 // Except as contained in this notice, the name of a copyright holder | |
57 // shall not be used in advertising or otherwise to promote the sale, use | |
58 // or other dealings in this Software without prior written authorization | |
59 // of the copyright holder. | |
60 | |
61 // ___________________________________________________________________________ | |
62 // | |
63 // All trademarks and registered trademarks mentioned herein are the property | |
64 // of their respective owners. | |
65 | |
66 void ToUnicodeCallbackSubstitute(const void* context, | |
67 UConverterToUnicodeArgs *to_args, | |
68 const char* code_units, | |
69 int32_t length, | |
70 UConverterCallbackReason reason, | |
71 UErrorCode * err) { | |
72 static const UChar kReplacementChar = 0xFFFD; | |
73 if (reason <= UCNV_IRREGULAR) { | |
74 if (context == NULL || | |
75 (*(reinterpret_cast<const char*>(context)) == 'i' && | |
76 reason == UCNV_UNASSIGNED)) { | |
77 *err = U_ZERO_ERROR; | |
78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); | |
79 } | |
80 // else the caller must have set the error code accordingly. | |
81 } | |
82 // else ignore the reset, close and clone calls. | |
83 } | |
84 | |
85 // ReadUnicodeCharacter -------------------------------------------------------- | 24 // ReadUnicodeCharacter -------------------------------------------------------- |
86 | 25 |
87 // Reads a UTF-8 stream, placing the next code point into the given output | 26 // Reads a UTF-8 stream, placing the next code point into the given output |
88 // |*code_point|. |src| represents the entire string to read, and |*char_index| | 27 // |*code_point|. |src| represents the entire string to read, and |*char_index| |
89 // is the character offset within the string to start reading at. |*char_index| | 28 // is the character offset within the string to start reading at. |*char_index| |
90 // will be updated to index the last character read, such that incrementing it | 29 // will be updated to index the last character read, such that incrementing it |
91 // (as in a for loop) will take the reader to the next character. | 30 // (as in a for loop) will take the reader to the next character. |
92 // | 31 // |
93 // Returns true on success. On false, |*code_point| will be invalid. | 32 // Returns true on success. On false, |*code_point| will be invalid. |
94 bool ReadUnicodeCharacter(const char* src, int32 src_len, | 33 bool ReadUnicodeCharacter(const char* src, int32 src_len, |
95 int32* char_index, uint32* code_point_out) { | 34 int32* char_index, uint32* code_point_out) { |
96 // U8_NEXT expects to be able to use -1 to signal an error, so we must | 35 // CBU8_NEXT expects to be able to use -1 to signal an error, so we must |
97 // use a signed type for code_point. But this function returns false | 36 // use a signed type for code_point. But this function returns false |
98 // on error anyway, so code_point_out is unsigned. | 37 // on error anyway, so code_point_out is unsigned. |
99 int32 code_point; | 38 int32 code_point; |
100 U8_NEXT(src, *char_index, src_len, code_point); | 39 CBU8_NEXT(src, *char_index, src_len, code_point); |
101 *code_point_out = static_cast<uint32>(code_point); | 40 *code_point_out = static_cast<uint32>(code_point); |
102 | 41 |
103 // The ICU macro above moves to the next char; we want to point to the last | 42 // The ICU macro above moves to the next char; we want to point to the last |
104 // char consumed. | 43 // char consumed. |
105 (*char_index)--; | 44 (*char_index)--; |
106 | 45 |
107 // Validate the decoded value. | 46 // Validate the decoded value. |
108 return IsValidCodepoint(code_point); | 47 return IsValidCodepoint(code_point); |
109 } | 48 } |
110 | 49 |
111 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. | 50 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. |
112 bool ReadUnicodeCharacter(const char16* src, int32 src_len, | 51 bool ReadUnicodeCharacter(const char16* src, int32 src_len, |
113 int32* char_index, uint32* code_point) { | 52 int32* char_index, uint32* code_point) { |
114 if (U16_IS_SURROGATE(src[*char_index])) { | 53 if (CBU16_IS_SURROGATE(src[*char_index])) { |
115 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || | 54 if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) || |
116 *char_index + 1 >= src_len || | 55 *char_index + 1 >= src_len || |
117 !U16_IS_TRAIL(src[*char_index + 1])) { | 56 !CBU16_IS_TRAIL(src[*char_index + 1])) { |
118 // Invalid surrogate pair; |*code_point| and |*char_index| are untouched. | 57 // Invalid surrogate pair; |*code_point| and |*char_index| are untouched. |
119 return false; | 58 return false; |
120 } | 59 } |
121 | 60 |
122 // Valid surrogate pair. | 61 // Valid surrogate pair. |
123 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], | 62 *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index], |
124 src[*char_index + 1]); | 63 src[*char_index + 1]); |
125 (*char_index)++; | 64 (*char_index)++; |
126 } else { | 65 } else { |
127 // Not a surrogate, just one 16-bit word. | 66 // Not a surrogate, just one 16-bit word. |
128 *code_point = src[*char_index]; | 67 *code_point = src[*char_index]; |
129 } | 68 } |
130 | 69 |
131 return IsValidCodepoint(*code_point); | 70 return IsValidCodepoint(*code_point); |
132 } | 71 } |
133 | 72 |
134 #if defined(WCHAR_T_IS_UTF32) | 73 #if defined(WCHAR_T_IS_UTF32) |
(...skipping 13 matching lines...) Expand all Loading... |
148 // Appends a UTF-8 character to the given 8-bit string. | 87 // Appends a UTF-8 character to the given 8-bit string. |
149 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { | 88 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { |
150 if (code_point <= 0x7f) { | 89 if (code_point <= 0x7f) { |
151 // Fast path the common case of one byte. | 90 // Fast path the common case of one byte. |
152 output->push_back(code_point); | 91 output->push_back(code_point); |
153 return; | 92 return; |
154 } | 93 } |
155 | 94 |
156 // U8_APPEND_UNSAFE can append up to 4 bytes. | 95 // CBU8_APPEND_UNSAFE can append up to 4 bytes. |
157 int32 char_offset = static_cast<int32>(output->length()); | 96 int32 char_offset = static_cast<int32>(output->length()); |
158 output->resize(char_offset + U8_MAX_LENGTH); | 97 output->resize(char_offset + CBU8_MAX_LENGTH); |
159 | 98 |
160 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | 99 CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); |
161 | 100 |
162 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so | 101 // CBU8_APPEND_UNSAFE advances char_offset past the inserted character, so |
163 // it will represent the new length of the string. | 102 // it will represent the new length of the string. |
164 output->resize(char_offset); | 103 output->resize(char_offset); |
165 } | 104 } |
166 | 105 |
167 // Appends the given code point as a UTF-16 character to the STL string. | 106 // Appends the given code point as a UTF-16 character to the STL string. |
168 void WriteUnicodeCharacter(uint32 code_point, string16* output) { | 107 void WriteUnicodeCharacter(uint32 code_point, string16* output) { |
169 if (U16_LENGTH(code_point) == 1) { | 108 if (CBU16_LENGTH(code_point) == 1) { |
170 // This code point is in the Basic Multilingual Plane (BMP). | 109 // This code point is in the Basic Multilingual Plane (BMP). |
171 output->push_back(static_cast<char16>(code_point)); | 110 output->push_back(static_cast<char16>(code_point)); |
172 } else { | 111 } else { |
173 // Non-BMP characters use a double-character encoding. | 112 // Non-BMP characters use a double-character encoding. |
174 int32 char_offset = static_cast<int32>(output->length()); | 113 int32 char_offset = static_cast<int32>(output->length()); |
175 output->resize(char_offset + U16_MAX_LENGTH); | 114 output->resize(char_offset + CBU16_MAX_LENGTH); |
176 U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | 115 CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); |
177 } | 116 } |
178 } | 117 } |
179 | 118 |
180 #if defined(WCHAR_T_IS_UTF32) | 119 #if defined(WCHAR_T_IS_UTF32) |
181 // Appends the given UTF-32 character to the given 32-bit string. | 120 // Appends the given UTF-32 character to the given 32-bit string. |
182 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { | 121 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { |
183 // This is the easy case, just append the character. | 122 // This is the easy case, just append the character. |
184 output->push_back(code_point); | 123 output->push_back(code_point); |
185 } | 124 } |
186 #endif // defined(WCHAR_T_IS_UTF32) | 125 #endif // defined(WCHAR_T_IS_UTF32) |
(...skipping 17 matching lines...) Expand all Loading... |
204 WriteUnicodeCharacter(code_point, output); | 143 WriteUnicodeCharacter(code_point, output); |
205 } else { | 144 } else { |
206 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) | 145 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) |
207 // in place of an invalid codepoint. | 146 // in place of an invalid codepoint. |
208 success = false; | 147 success = false; |
209 } | 148 } |
210 } | 149 } |
211 return success; | 150 return success; |
212 } | 151 } |
213 | 152 |
214 | |
215 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount | 153 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount |
216 // of space in the given string. We also assume that the input character types | 154 // of space in the given string. We also assume that the input character types |
217 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume | 155 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume |
218 // the string length is greater than zero. | 156 // the string length is greater than zero. |
219 template<typename CHAR> | 157 template<typename CHAR> |
220 void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { | 158 void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { |
221 if (src[0] < 0x80) { | 159 if (src[0] < 0x80) { |
222 // Assume that the entire input will be ASCII. | 160 // Assume that the entire input will be ASCII. |
223 output->reserve(src_len); | 161 output->reserve(src_len); |
224 } else { | 162 } else { |
(...skipping 10 matching lines...) Expand all Loading... |
235 if (static_cast<unsigned char>(src[0]) < 0x80) { | 173 if (static_cast<unsigned char>(src[0]) < 0x80) { |
236 // Assume the input is all ASCII, which means 1:1 correspondence. | 174 // Assume the input is all ASCII, which means 1:1 correspondence. |
237 output->reserve(src_len); | 175 output->reserve(src_len); |
238 } else { | 176 } else { |
239 // Otherwise assume that the UTF-8 sequences will have 2 bytes for each | 177 // Otherwise assume that the UTF-8 sequences will have 2 bytes for each |
240 // character. | 178 // character. |
241 output->reserve(src_len / 2); | 179 output->reserve(src_len / 2); |
242 } | 180 } |
243 } | 181 } |
244 | 182 |
245 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, | |
246 int uchar_len, OnStringUtilConversionError::Type on_error, | |
247 std::string* encoded) { | |
248 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, | |
249 ucnv_getMaxCharSize(converter)); | |
250 encoded->resize(encoded_max_length); | |
251 | |
252 UErrorCode status = U_ZERO_ERROR; | |
253 | |
254 // Setup our error handler. | |
255 switch (on_error) { | |
256 case OnStringUtilConversionError::FAIL: | |
257 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, | |
258 NULL, NULL, &status); | |
259 break; | |
260 case OnStringUtilConversionError::SKIP: | |
261 case OnStringUtilConversionError::SUBSTITUTE: | |
262 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, | |
263 NULL, NULL, &status); | |
264 break; | |
265 default: | |
266 NOTREACHED(); | |
267 } | |
268 | |
269 // ucnv_fromUChars returns size not including terminating null | |
270 int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], | |
271 encoded_max_length, uchar_src, uchar_len, &status); | |
272 encoded->resize(actual_size); | |
273 ucnv_close(converter); | |
274 if (U_SUCCESS(status)) | |
275 return true; | |
276 encoded->clear(); // Make sure the output is empty on error. | |
277 return false; | |
278 } | |
279 | |
280 // Set up our error handler for ToUTF-16 converters | |
281 void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error, | |
282 UConverter* converter, UErrorCode* status) { | |
283 switch (on_error) { | |
284 case OnStringUtilConversionError::FAIL: | |
285 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, | |
286 NULL, NULL, status); | |
287 break; | |
288 case OnStringUtilConversionError::SKIP: | |
289 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, | |
290 NULL, NULL, status); | |
291 break; | |
292 case OnStringUtilConversionError::SUBSTITUTE: | |
293 ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, | |
294 NULL, NULL, status); | |
295 break; | |
296 default: | |
297 NOTREACHED(); | |
298 } | |
299 } | |
300 | |
301 inline UConverterType utf32_platform_endian() { | |
302 #if U_IS_BIG_ENDIAN | |
303 return UCNV_UTF32_BigEndian; | |
304 #else | |
305 return UCNV_UTF32_LittleEndian; | |
306 #endif | |
307 } | |
308 | |
309 } // namespace | 183 } // namespace |
310 | 184 |
311 // UTF-8 <-> Wide -------------------------------------------------------------- | 185 // UTF-8 <-> Wide -------------------------------------------------------------- |
312 | 186 |
313 std::string WideToUTF8(const std::wstring& wide) { | 187 std::string WideToUTF8(const std::wstring& wide) { |
314 std::string ret; | 188 std::string ret; |
315 if (wide.empty()) | 189 if (wide.empty()) |
316 return ret; | 190 return ret; |
317 | 191 |
318 // Ignore the success flag of this call, it will do the best it can for | 192 // Ignore the success flag of this call, it will do the best it can for |
(...skipping 159 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
478 | 352 |
479 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | 353 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { |
480 return WideToUTF8(src, src_len, output); | 354 return WideToUTF8(src, src_len, output); |
481 } | 355 } |
482 | 356 |
483 std::string UTF16ToUTF8(const string16& utf16) { | 357 std::string UTF16ToUTF8(const string16& utf16) { |
484 return WideToUTF8(utf16); | 358 return WideToUTF8(utf16); |
485 } | 359 } |
486 | 360 |
487 #endif | 361 #endif |
488 | |
489 // Codepage <-> Wide/UTF-16 --------------------------------------------------- | |
490 | |
491 // Convert a wstring into the specified codepage_name. If the codepage | |
492 // isn't found, return false. | |
493 bool WideToCodepage(const std::wstring& wide, | |
494 const char* codepage_name, | |
495 OnStringUtilConversionError::Type on_error, | |
496 std::string* encoded) { | |
497 #if defined(WCHAR_T_IS_UTF16) | |
498 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); | |
499 #elif defined(WCHAR_T_IS_UTF32) | |
500 encoded->clear(); | |
501 | |
502 UErrorCode status = U_ZERO_ERROR; | |
503 UConverter* converter = ucnv_open(codepage_name, &status); | |
504 if (!U_SUCCESS(status)) | |
505 return false; | |
506 | |
507 int utf16_len; | |
508 // When wchar_t is wider than UChar (16 bits), transform |wide| into a | |
509 // UChar* string. Size the UChar* buffer to be large enough to hold twice | |
510 // as many UTF-16 code units (UChar's) as there are Unicode code points, | |
511 // in case each code points translates to a UTF-16 surrogate pair, | |
512 // and leave room for a NUL terminator. | |
513 std::vector<UChar> utf16(wide.length() * 2 + 1); | |
514 u_strFromWCS(&utf16[0], utf16.size(), &utf16_len, | |
515 wide.c_str(), wide.length(), &status); | |
516 DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; | |
517 | |
518 return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded); | |
519 #endif // defined(WCHAR_T_IS_UTF32) | |
520 } | |
521 | |
522 // Convert a UTF-16 string into the specified codepage_name. If the codepage | |
523 // isn't found, return false. | |
524 bool UTF16ToCodepage(const string16& utf16, | |
525 const char* codepage_name, | |
526 OnStringUtilConversionError::Type on_error, | |
527 std::string* encoded) { | |
528 encoded->clear(); | |
529 | |
530 UErrorCode status = U_ZERO_ERROR; | |
531 UConverter* converter = ucnv_open(codepage_name, &status); | |
532 if (!U_SUCCESS(status)) | |
533 return false; | |
534 | |
535 return ConvertFromUTF16(converter, utf16.c_str(), | |
536 static_cast<int>(utf16.length()), on_error, encoded); | |
537 } | |
538 | |
539 // Converts a string of the given codepage into wstring. | |
540 // If the codepage isn't found, return false. | |
541 bool CodepageToWide(const std::string& encoded, | |
542 const char* codepage_name, | |
543 OnStringUtilConversionError::Type on_error, | |
544 std::wstring* wide) { | |
545 #if defined(WCHAR_T_IS_UTF16) | |
546 return CodepageToUTF16(encoded, codepage_name, on_error, wide); | |
547 #elif defined(WCHAR_T_IS_UTF32) | |
548 wide->clear(); | |
549 | |
550 UErrorCode status = U_ZERO_ERROR; | |
551 UConverter* converter = ucnv_open(codepage_name, &status); | |
552 if (!U_SUCCESS(status)) | |
553 return false; | |
554 | |
555 // The maximum length in 4 byte unit of UTF-32 output would be | |
556 // at most the same as the number of bytes in input. In the worst | |
557 // case of GB18030 (excluding escape-based encodings like ISO-2022-JP), | |
558 // this can be 4 times larger than actually needed. | |
559 size_t wchar_max_length = encoded.length() + 1; | |
560 | |
561 // The byte buffer and its length to pass to ucnv_toAlgorithmic. | |
562 char* byte_buffer = reinterpret_cast<char*>( | |
563 WriteInto(wide, wchar_max_length)); | |
564 int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; | |
565 | |
566 SetUpErrorHandlerForToUChars(on_error, converter, &status); | |
567 int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), | |
568 converter, | |
569 byte_buffer, | |
570 byte_buffer_length, | |
571 encoded.data(), | |
572 static_cast<int>(encoded.length()), | |
573 &status); | |
574 ucnv_close(converter); | |
575 | |
576 if (!U_SUCCESS(status)) { | |
577 wide->clear(); // Make sure the output is empty on error. | |
578 return false; | |
579 } | |
580 | |
581 // actual_size is # of bytes. | |
582 wide->resize(actual_size / 4); | |
583 return true; | |
584 #endif // defined(WCHAR_T_IS_UTF32) | |
585 } | |
586 | |
587 // Converts a string of the given codepage into UTF-16. | |
588 // If the codepage isn't found, return false. | |
589 bool CodepageToUTF16(const std::string& encoded, | |
590 const char* codepage_name, | |
591 OnStringUtilConversionError::Type on_error, | |
592 string16* utf16) { | |
593 utf16->clear(); | |
594 | |
595 UErrorCode status = U_ZERO_ERROR; | |
596 UConverter* converter = ucnv_open(codepage_name, &status); | |
597 if (!U_SUCCESS(status)) | |
598 return false; | |
599 | |
600 // Even in the worst case, the maximum length in 2-byte units of UTF-16 | |
601 // output would be at most the same as the number of bytes in input. There | |
602 // is no single-byte encoding in which a character is mapped to a | |
603 // non-BMP character requiring two 2-byte units. | |
604 // | |
605 // Moreover, non-BMP characters in legacy multibyte encodings | |
606 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are | |
607 // BOCU and SCSU, but we don't care about them. | |
608 size_t uchar_max_length = encoded.length() + 1; | |
609 | |
610 SetUpErrorHandlerForToUChars(on_error, converter, &status); | |
611 int actual_size = ucnv_toUChars(converter, | |
612 WriteInto(utf16, uchar_max_length), | |
613 static_cast<int>(uchar_max_length), | |
614 encoded.data(), | |
615 static_cast<int>(encoded.length()), | |
616 &status); | |
617 ucnv_close(converter); | |
618 if (!U_SUCCESS(status)) { | |
619 utf16->clear(); // Make sure the output is empty on error. | |
620 return false; | |
621 } | |
622 | |
623 utf16->resize(actual_size); | |
624 return true; | |
625 } | |
626 | |
OLD | NEW |