OLD | NEW |
| (Empty) |
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "base/i18n/string_conversions.h" | |
6 | |
7 #include <vector> | |
8 | |
9 #include "base/basictypes.h" | |
10 #include "base/logging.h" | |
11 #include "base/string_util.h" | |
12 #include "unicode/ucnv.h" | |
13 #include "unicode/ucnv_cb.h" | |
14 #include "unicode/ucnv_err.h" | |
15 #include "unicode/ustring.h" | |
16 | |
17 namespace { | |
18 | |
19 inline bool IsValidCodepoint(uint32 code_point) { | |
20 // Excludes the surrogate code points ([0xD800, 0xDFFF]) and | |
21 // codepoints larger than 0x10FFFF (the highest codepoint allowed). | |
22 // Non-characters and unassigned codepoints are allowed. | |
23 return code_point < 0xD800u || | |
24 (code_point >= 0xE000u && code_point <= 0x10FFFFu); | |
25 } | |
26 | |
27 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE | |
28 // in source/common/ucnv_err.c. | |
29 | |
30 // Copyright (c) 1995-2006 International Business Machines Corporation | |
31 // and others | |
32 // | |
33 // All rights reserved. | |
34 // | |
35 | |
36 // Permission is hereby granted, free of charge, to any person obtaining a | |
37 // copy of this software and associated documentation files (the "Software"), | |
38 // to deal in the Software without restriction, including without limitation | |
39 // the rights to use, copy, modify, merge, publish, distribute, and/or | |
40 // sell copies of the Software, and to permit persons to whom the Software | |
41 // is furnished to do so, provided that the above copyright notice(s) and | |
42 // this permission notice appear in all copies of the Software and that | |
43 // both the above copyright notice(s) and this permission notice appear in | |
44 // supporting documentation. | |
45 // | |
46 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
47 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
48 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT | |
49 // OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS | |
50 // INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT | |
51 // OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS | |
52 // OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE | |
53 // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE | |
54 // OR PERFORMANCE OF THIS SOFTWARE. | |
55 // | |
56 // Except as contained in this notice, the name of a copyright holder | |
57 // shall not be used in advertising or otherwise to promote the sale, use | |
58 // or other dealings in this Software without prior written authorization | |
59 // of the copyright holder. | |
60 | |
61 // ___________________________________________________________________________ | |
62 // | |
63 // All trademarks and registered trademarks mentioned herein are the property | |
64 // of their respective owners. | |
65 | |
66 void ToUnicodeCallbackSubstitute(const void* context, | |
67 UConverterToUnicodeArgs *to_args, | |
68 const char* code_units, | |
69 int32_t length, | |
70 UConverterCallbackReason reason, | |
71 UErrorCode * err) { | |
72 static const UChar kReplacementChar = 0xFFFD; | |
73 if (reason <= UCNV_IRREGULAR) { | |
74 if (context == NULL || | |
75 (*(reinterpret_cast<const char*>(context)) == 'i' && | |
76 reason == UCNV_UNASSIGNED)) { | |
77 *err = U_ZERO_ERROR; | |
78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); | |
79 } | |
80 // else the caller must have set the error code accordingly. | |
81 } | |
82 // else ignore the reset, close and clone calls. | |
83 } | |
84 | |
85 // ReadUnicodeCharacter -------------------------------------------------------- | |
86 | |
87 // Reads a UTF-8 stream, placing the next code point into the given output | |
88 // |*code_point|. |src| represents the entire string to read, and |*char_index| | |
89 // is the character offset within the string to start reading at. |*char_index| | |
90 // will be updated to index the last character read, such that incrementing it | |
91 // (as in a for loop) will take the reader to the next character. | |
92 // | |
93 // Returns true on success. On false, |*code_point| will be invalid. | |
94 bool ReadUnicodeCharacter(const char* src, int32 src_len, | |
95 int32* char_index, uint32* code_point_out) { | |
96 // U8_NEXT expects to be able to use -1 to signal an error, so we must | |
97 // use a signed type for code_point. But this function returns false | |
98 // on error anyway, so code_point_out is unsigned. | |
99 int32 code_point; | |
100 U8_NEXT(src, *char_index, src_len, code_point); | |
101 *code_point_out = static_cast<uint32>(code_point); | |
102 | |
103 // The ICU macro above moves to the next char, we want to point to the last | |
104 // char consumed. | |
105 (*char_index)--; | |
106 | |
107 // Validate the decoded value. | |
108 return IsValidCodepoint(code_point); | |
109 } | |
110 | |
111 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. | |
112 bool ReadUnicodeCharacter(const char16* src, int32 src_len, | |
113 int32* char_index, uint32* code_point) { | |
114 if (U16_IS_SURROGATE(src[*char_index])) { | |
115 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || | |
116 *char_index + 1 >= src_len || | |
117 !U16_IS_TRAIL(src[*char_index + 1])) { | |
118 // Invalid surrogate pair. | |
119 return false; | |
120 } | |
121 | |
122 // Valid surrogate pair. | |
123 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], | |
124 src[*char_index + 1]); | |
125 (*char_index)++; | |
126 } else { | |
127 // Not a surrogate, just one 16-bit word. | |
128 *code_point = src[*char_index]; | |
129 } | |
130 | |
131 return IsValidCodepoint(*code_point); | |
132 } | |
133 | |
134 #if defined(WCHAR_T_IS_UTF32) | |
135 // Reads UTF-32 character. The usage is the same as the 8-bit version above. | |
136 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, | |
137 int32* char_index, uint32* code_point) { | |
138 // Conversion is easy since the source is 32-bit. | |
139 *code_point = src[*char_index]; | |
140 | |
141 // Validate the value. | |
142 return IsValidCodepoint(*code_point); | |
143 } | |
144 #endif // defined(WCHAR_T_IS_UTF32) | |
145 | |
146 // WriteUnicodeCharacter ------------------------------------------------------- | |
147 | |
148 // Appends a UTF-8 character to the given 8-bit string. | |
149 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { | |
150 if (code_point <= 0x7f) { | |
151 // Fast path the common case of one byte. | |
152 output->push_back(code_point); | |
153 return; | |
154 } | |
155 | |
156 // U8_APPEND_UNSAFE can append up to 4 bytes. | |
157 int32 char_offset = static_cast<int32>(output->length()); | |
158 output->resize(char_offset + U8_MAX_LENGTH); | |
159 | |
160 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
161 | |
162 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so | |
163 // it will represent the new length of the string. | |
164 output->resize(char_offset); | |
165 } | |
166 | |
167 // Appends the given code point as a UTF-16 character to the STL string. | |
168 void WriteUnicodeCharacter(uint32 code_point, string16* output) { | |
169 if (U16_LENGTH(code_point) == 1) { | |
170 // Thie code point is in the Basic Multilingual Plane (BMP). | |
171 output->push_back(static_cast<char16>(code_point)); | |
172 } else { | |
173 // Non-BMP characters use a double-character encoding. | |
174 int32 char_offset = static_cast<int32>(output->length()); | |
175 output->resize(char_offset + U16_MAX_LENGTH); | |
176 U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
177 } | |
178 } | |
179 | |
180 #if defined(WCHAR_T_IS_UTF32) | |
181 // Appends the given UTF-32 character to the given 32-bit string. | |
182 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { | |
183 // This is the easy case, just append the character. | |
184 output->push_back(code_point); | |
185 } | |
186 #endif // defined(WCHAR_T_IS_UTF32) | |
187 | |
188 // Generalized Unicode converter ----------------------------------------------- | |
189 | |
190 // Converts the given source Unicode character type to the given destination | |
191 // Unicode character type as a STL string. The given input buffer and size | |
192 // determine the source, and the given output STL string will be replaced by | |
193 // the result. | |
194 template<typename SRC_CHAR, typename DEST_STRING> | |
195 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { | |
196 output->clear(); | |
197 | |
198 // ICU requires 32-bit numbers. | |
199 bool success = true; | |
200 int32 src_len32 = static_cast<int32>(src_len); | |
201 for (int32 i = 0; i < src_len32; i++) { | |
202 uint32 code_point; | |
203 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | |
204 WriteUnicodeCharacter(code_point, output); | |
205 } else { | |
206 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) | |
207 // in place of an invalid codepoint. | |
208 success = false; | |
209 } | |
210 } | |
211 return success; | |
212 } | |
213 | |
214 | |
// Reserves space in |*output| for the UTF-8 form of the |src_len| units at
// |src|. The guess is based on the first unit only: if it is below 0x80 the
// whole input is assumed to be ASCII (one byte each), otherwise every unit is
// assumed to need three bytes.
//
// The input character type is assumed to be unsigned, and |src_len| must be
// greater than zero because |src[0]| is read unconditionally.
template<typename CHAR>
void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
  const bool starts_with_ascii = src[0] < 0x80;
  const size_t estimated_bytes = starts_with_ascii ? src_len : src_len * 3;
  output->reserve(estimated_bytes);
}
229 | |
// Reserves space in |*output| (a UTF-16 or UTF-32 string) for the converted
// form of the |src_len| UTF-8 bytes at |src|. The guess is based on the first
// byte only: if it is below 0x80 a 1:1 mapping is assumed, otherwise each
// output character is assumed to come from a two-byte sequence.
//
// |src_len| must be greater than zero because |src[0]| is read
// unconditionally.
template<typename STRING>
void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
  const unsigned char first_byte = static_cast<unsigned char>(src[0]);
  const size_t estimated_units = (first_byte < 0x80) ? src_len : src_len / 2;
  output->reserve(estimated_units);
}
244 | |
245 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, | |
246 int uchar_len, OnStringUtilConversionError::Type on_error, | |
247 std::string* encoded) { | |
248 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, | |
249 ucnv_getMaxCharSize(converter)); | |
250 encoded->resize(encoded_max_length); | |
251 | |
252 UErrorCode status = U_ZERO_ERROR; | |
253 | |
254 // Setup our error handler. | |
255 switch (on_error) { | |
256 case OnStringUtilConversionError::FAIL: | |
257 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, | |
258 NULL, NULL, &status); | |
259 break; | |
260 case OnStringUtilConversionError::SKIP: | |
261 case OnStringUtilConversionError::SUBSTITUTE: | |
262 ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, | |
263 NULL, NULL, &status); | |
264 break; | |
265 default: | |
266 NOTREACHED(); | |
267 } | |
268 | |
269 // ucnv_fromUChars returns size not including terminating null | |
270 int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], | |
271 encoded_max_length, uchar_src, uchar_len, &status); | |
272 encoded->resize(actual_size); | |
273 ucnv_close(converter); | |
274 if (U_SUCCESS(status)) | |
275 return true; | |
276 encoded->clear(); // Make sure the output is empty on error. | |
277 return false; | |
278 } | |
279 | |
280 // Set up our error handler for ToUTF-16 converters | |
281 void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error, | |
282 UConverter* converter, UErrorCode* status) { | |
283 switch (on_error) { | |
284 case OnStringUtilConversionError::FAIL: | |
285 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, | |
286 NULL, NULL, status); | |
287 break; | |
288 case OnStringUtilConversionError::SKIP: | |
289 ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, | |
290 NULL, NULL, status); | |
291 break; | |
292 case OnStringUtilConversionError::SUBSTITUTE: | |
293 ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, | |
294 NULL, NULL, status); | |
295 break; | |
296 default: | |
297 NOTREACHED(); | |
298 } | |
299 } | |
300 | |
301 inline UConverterType utf32_platform_endian() { | |
302 #if U_IS_BIG_ENDIAN | |
303 return UCNV_UTF32_BigEndian; | |
304 #else | |
305 return UCNV_UTF32_LittleEndian; | |
306 #endif | |
307 } | |
308 | |
309 } // namespace | |
310 | |
311 // UTF-8 <-> Wide -------------------------------------------------------------- | |
312 | |
313 std::string WideToUTF8(const std::wstring& wide) { | |
314 std::string ret; | |
315 if (wide.empty()) | |
316 return ret; | |
317 | |
318 // Ignore the success flag of this call, it will do the best it can for | |
319 // invalid input, which is what we want here. | |
320 WideToUTF8(wide.data(), wide.length(), &ret); | |
321 return ret; | |
322 } | |
323 | |
324 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { | |
325 if (src_len == 0) { | |
326 output->clear(); | |
327 return true; | |
328 } | |
329 | |
330 ReserveUTF8Output(src, src_len, output); | |
331 return ConvertUnicode<wchar_t, std::string>(src, src_len, output); | |
332 } | |
333 | |
334 std::wstring UTF8ToWide(const base::StringPiece& utf8) { | |
335 std::wstring ret; | |
336 if (utf8.empty()) | |
337 return ret; | |
338 | |
339 UTF8ToWide(utf8.data(), utf8.length(), &ret); | |
340 return ret; | |
341 } | |
342 | |
343 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { | |
344 if (src_len == 0) { | |
345 output->clear(); | |
346 return true; | |
347 } | |
348 | |
349 ReserveUTF16Or32Output(src, src_len, output); | |
350 return ConvertUnicode<char, std::wstring>(src, src_len, output); | |
351 } | |
352 | |
353 // UTF-16 <-> Wide ------------------------------------------------------------- | |
354 | |
355 #if defined(WCHAR_T_IS_UTF16) | |
356 | |
357 // When wide == UTF-16, then conversions are a NOP. | |
358 string16 WideToUTF16(const std::wstring& wide) { | |
359 return wide; | |
360 } | |
361 | |
362 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
363 output->assign(src, src_len); | |
364 return true; | |
365 } | |
366 | |
367 std::wstring UTF16ToWide(const string16& utf16) { | |
368 return utf16; | |
369 } | |
370 | |
371 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
372 output->assign(src, src_len); | |
373 return true; | |
374 } | |
375 | |
376 #elif defined(WCHAR_T_IS_UTF32) | |
377 | |
378 string16 WideToUTF16(const std::wstring& wide) { | |
379 string16 ret; | |
380 if (wide.empty()) | |
381 return ret; | |
382 | |
383 WideToUTF16(wide.data(), wide.length(), &ret); | |
384 return ret; | |
385 } | |
386 | |
387 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
388 if (src_len == 0) { | |
389 output->clear(); | |
390 return true; | |
391 } | |
392 | |
393 // Assume that normally we won't have any non-BMP characters so the counts | |
394 // will be the same. | |
395 output->reserve(src_len); | |
396 return ConvertUnicode<wchar_t, string16>(src, src_len, output); | |
397 } | |
398 | |
399 std::wstring UTF16ToWide(const string16& utf16) { | |
400 std::wstring ret; | |
401 if (utf16.empty()) | |
402 return ret; | |
403 | |
404 UTF16ToWide(utf16.data(), utf16.length(), &ret); | |
405 return ret; | |
406 } | |
407 | |
408 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
409 if (src_len == 0) { | |
410 output->clear(); | |
411 return true; | |
412 } | |
413 | |
414 // Assume that normally we won't have any non-BMP characters so the counts | |
415 // will be the same. | |
416 output->reserve(src_len); | |
417 return ConvertUnicode<char16, std::wstring>(src, src_len, output); | |
418 } | |
419 | |
420 #endif // defined(WCHAR_T_IS_UTF32) | |
421 | |
422 // UTF16 <-> UTF8 -------------------------------------------------------------- | |
423 | |
424 #if defined(WCHAR_T_IS_UTF32) | |
425 | |
426 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
427 if (src_len == 0) { | |
428 output->clear(); | |
429 return true; | |
430 } | |
431 | |
432 ReserveUTF16Or32Output(src, src_len, output); | |
433 return ConvertUnicode<char, string16>(src, src_len, output); | |
434 } | |
435 | |
436 string16 UTF8ToUTF16(const std::string& utf8) { | |
437 string16 ret; | |
438 if (utf8.empty()) | |
439 return ret; | |
440 | |
441 // Ignore the success flag of this call, it will do the best it can for | |
442 // invalid input, which is what we want here. | |
443 UTF8ToUTF16(utf8.data(), utf8.length(), &ret); | |
444 return ret; | |
445 } | |
446 | |
447 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
448 if (src_len == 0) { | |
449 output->clear(); | |
450 return true; | |
451 } | |
452 | |
453 ReserveUTF8Output(src, src_len, output); | |
454 return ConvertUnicode<char16, std::string>(src, src_len, output); | |
455 } | |
456 | |
457 std::string UTF16ToUTF8(const string16& utf16) { | |
458 std::string ret; | |
459 if (utf16.empty()) | |
460 return ret; | |
461 | |
462 // Ignore the success flag of this call, it will do the best it can for | |
463 // invalid input, which is what we want here. | |
464 UTF16ToUTF8(utf16.data(), utf16.length(), &ret); | |
465 return ret; | |
466 } | |
467 | |
468 #elif defined(WCHAR_T_IS_UTF16) | |
469 // Easy case since we can use the "wide" versions we already wrote above. | |
470 | |
471 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
472 return UTF8ToWide(src, src_len, output); | |
473 } | |
474 | |
475 string16 UTF8ToUTF16(const std::string& utf8) { | |
476 return UTF8ToWide(utf8); | |
477 } | |
478 | |
479 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
480 return WideToUTF8(src, src_len, output); | |
481 } | |
482 | |
483 std::string UTF16ToUTF8(const string16& utf16) { | |
484 return WideToUTF8(utf16); | |
485 } | |
486 | |
487 #endif | |
488 | |
489 // Codepage <-> Wide/UTF-16 --------------------------------------------------- | |
490 | |
491 // Convert a wstring into the specified codepage_name. If the codepage | |
492 // isn't found, return false. | |
493 bool WideToCodepage(const std::wstring& wide, | |
494 const char* codepage_name, | |
495 OnStringUtilConversionError::Type on_error, | |
496 std::string* encoded) { | |
497 #if defined(WCHAR_T_IS_UTF16) | |
498 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); | |
499 #elif defined(WCHAR_T_IS_UTF32) | |
500 encoded->clear(); | |
501 | |
502 UErrorCode status = U_ZERO_ERROR; | |
503 UConverter* converter = ucnv_open(codepage_name, &status); | |
504 if (!U_SUCCESS(status)) | |
505 return false; | |
506 | |
507 int utf16_len; | |
508 // When wchar_t is wider than UChar (16 bits), transform |wide| into a | |
509 // UChar* string. Size the UChar* buffer to be large enough to hold twice | |
510 // as many UTF-16 code units (UChar's) as there are Unicode code points, | |
511 // in case each code points translates to a UTF-16 surrogate pair, | |
512 // and leave room for a NUL terminator. | |
513 std::vector<UChar> utf16(wide.length() * 2 + 1); | |
514 u_strFromWCS(&utf16[0], utf16.size(), &utf16_len, | |
515 wide.c_str(), wide.length(), &status); | |
516 DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; | |
517 | |
518 return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded); | |
519 #endif // defined(WCHAR_T_IS_UTF32) | |
520 } | |
521 | |
522 // Convert a UTF-16 string into the specified codepage_name. If the codepage | |
523 // isn't found, return false. | |
524 bool UTF16ToCodepage(const string16& utf16, | |
525 const char* codepage_name, | |
526 OnStringUtilConversionError::Type on_error, | |
527 std::string* encoded) { | |
528 encoded->clear(); | |
529 | |
530 UErrorCode status = U_ZERO_ERROR; | |
531 UConverter* converter = ucnv_open(codepage_name, &status); | |
532 if (!U_SUCCESS(status)) | |
533 return false; | |
534 | |
535 return ConvertFromUTF16(converter, utf16.c_str(), | |
536 static_cast<int>(utf16.length()), on_error, encoded); | |
537 } | |
538 | |
539 // Converts a string of the given codepage into wstring. | |
540 // If the codepage isn't found, return false. | |
541 bool CodepageToWide(const std::string& encoded, | |
542 const char* codepage_name, | |
543 OnStringUtilConversionError::Type on_error, | |
544 std::wstring* wide) { | |
545 #if defined(WCHAR_T_IS_UTF16) | |
546 return CodepageToUTF16(encoded, codepage_name, on_error, wide); | |
547 #elif defined(WCHAR_T_IS_UTF32) | |
548 wide->clear(); | |
549 | |
550 UErrorCode status = U_ZERO_ERROR; | |
551 UConverter* converter = ucnv_open(codepage_name, &status); | |
552 if (!U_SUCCESS(status)) | |
553 return false; | |
554 | |
555 // The maximum length in 4 byte unit of UTF-32 output would be | |
556 // at most the same as the number of bytes in input. In the worst | |
557 // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP), | |
558 // this can be 4 times larger than actually needed. | |
559 size_t wchar_max_length = encoded.length() + 1; | |
560 | |
561 // The byte buffer and its length to pass to ucnv_toAlgorithimic. | |
562 char* byte_buffer = reinterpret_cast<char*>( | |
563 WriteInto(wide, wchar_max_length)); | |
564 int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; | |
565 | |
566 SetUpErrorHandlerForToUChars(on_error, converter, &status); | |
567 int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), | |
568 converter, | |
569 byte_buffer, | |
570 byte_buffer_length, | |
571 encoded.data(), | |
572 static_cast<int>(encoded.length()), | |
573 &status); | |
574 ucnv_close(converter); | |
575 | |
576 if (!U_SUCCESS(status)) { | |
577 wide->clear(); // Make sure the output is empty on error. | |
578 return false; | |
579 } | |
580 | |
581 // actual_size is # of bytes. | |
582 wide->resize(actual_size / 4); | |
583 return true; | |
584 #endif // defined(WCHAR_T_IS_UTF32) | |
585 } | |
586 | |
587 // Converts a string of the given codepage into UTF-16. | |
588 // If the codepage isn't found, return false. | |
589 bool CodepageToUTF16(const std::string& encoded, | |
590 const char* codepage_name, | |
591 OnStringUtilConversionError::Type on_error, | |
592 string16* utf16) { | |
593 utf16->clear(); | |
594 | |
595 UErrorCode status = U_ZERO_ERROR; | |
596 UConverter* converter = ucnv_open(codepage_name, &status); | |
597 if (!U_SUCCESS(status)) | |
598 return false; | |
599 | |
600 // Even in the worst case, the maximum length in 2-byte units of UTF-16 | |
601 // output would be at most the same as the number of bytes in input. There | |
602 // is no single-byte encoding in which a character is mapped to a | |
603 // non-BMP character requiring two 2-byte units. | |
604 // | |
605 // Moreover, non-BMP characters in legacy multibyte encodings | |
606 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are | |
607 // BOCU and SCSU, but we don't care about them. | |
608 size_t uchar_max_length = encoded.length() + 1; | |
609 | |
610 SetUpErrorHandlerForToUChars(on_error, converter, &status); | |
611 int actual_size = ucnv_toUChars(converter, | |
612 WriteInto(utf16, uchar_max_length), | |
613 static_cast<int>(uchar_max_length), | |
614 encoded.data(), | |
615 static_cast<int>(encoded.length()), | |
616 &status); | |
617 ucnv_close(converter); | |
618 if (!U_SUCCESS(status)) { | |
619 utf16->clear(); // Make sure the output is empty on error. | |
620 return false; | |
621 } | |
622 | |
623 utf16->resize(actual_size); | |
624 return true; | |
625 } | |
626 | |
OLD | NEW |