OLD | NEW |
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/i18n/string_conversions.h" | 5 #include "base/i18n/icu_string_conversions.h" |
6 | 6 |
7 #include <vector> | 7 #include <vector> |
8 | 8 |
9 #include "base/basictypes.h" | 9 #include "base/basictypes.h" |
10 #include "base/logging.h" | 10 #include "base/logging.h" |
11 #include "base/string_util.h" | 11 #include "base/string_util.h" |
12 #include "unicode/ucnv.h" | 12 #include "unicode/ucnv.h" |
13 #include "unicode/ucnv_cb.h" | 13 #include "unicode/ucnv_cb.h" |
14 #include "unicode/ucnv_err.h" | 14 #include "unicode/ucnv_err.h" |
15 #include "unicode/ustring.h" | 15 #include "unicode/ustring.h" |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
75 (*(reinterpret_cast<const char*>(context)) == 'i' && | 75 (*(reinterpret_cast<const char*>(context)) == 'i' && |
76 reason == UCNV_UNASSIGNED)) { | 76 reason == UCNV_UNASSIGNED)) { |
77 *err = U_ZERO_ERROR; | 77 *err = U_ZERO_ERROR; |
78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); | 78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); |
79 } | 79 } |
80 // else the caller must have set the error code accordingly. | 80 // else the caller must have set the error code accordingly. |
81 } | 81 } |
82 // else ignore the reset, close and clone calls. | 82 // else ignore the reset, close and clone calls. |
83 } | 83 } |
84 | 84 |
85 // ReadUnicodeCharacter -------------------------------------------------------- | |
86 | |
87 // Reads a UTF-8 stream, placing the next code point into the given output | |
88 // |*code_point|. |src| represents the entire string to read, and |*char_index| | |
89 // is the character offset within the string to start reading at. |*char_index| | |
90 // will be updated to index the last character read, such that incrementing it | |
91 // (as in a for loop) will take the reader to the next character. | |
92 // | |
93 // Returns true on success. On false, |*code_point| will be invalid. | |
94 bool ReadUnicodeCharacter(const char* src, int32 src_len, | |
95 int32* char_index, uint32* code_point_out) { | |
96 // U8_NEXT expects to be able to use -1 to signal an error, so we must | |
97 // use a signed type for code_point. But this function returns false | |
98 // on error anyway, so code_point_out is unsigned. | |
99 int32 code_point; | |
100 U8_NEXT(src, *char_index, src_len, code_point); | |
101 *code_point_out = static_cast<uint32>(code_point); | |
102 | |
103 // The ICU macro above moves to the next char, we want to point to the last | |
104 // char consumed. | |
105 (*char_index)--; | |
106 | |
107 // Validate the decoded value. | |
108 return IsValidCodepoint(code_point); | |
109 } | |
110 | |
111 // Reads a UTF-16 character. The usage is the same as the 8-bit version above. | |
112 bool ReadUnicodeCharacter(const char16* src, int32 src_len, | |
113 int32* char_index, uint32* code_point) { | |
114 if (U16_IS_SURROGATE(src[*char_index])) { | |
115 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || | |
116 *char_index + 1 >= src_len || | |
117 !U16_IS_TRAIL(src[*char_index + 1])) { | |
118 // Invalid surrogate pair. | |
119 return false; | |
120 } | |
121 | |
122 // Valid surrogate pair. | |
123 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], | |
124 src[*char_index + 1]); | |
125 (*char_index)++; | |
126 } else { | |
127 // Not a surrogate, just one 16-bit word. | |
128 *code_point = src[*char_index]; | |
129 } | |
130 | |
131 return IsValidCodepoint(*code_point); | |
132 } | |
133 | |
134 #if defined(WCHAR_T_IS_UTF32) | |
135 // Reads UTF-32 character. The usage is the same as the 8-bit version above. | |
136 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, | |
137 int32* char_index, uint32* code_point) { | |
138 // Conversion is easy since the source is 32-bit. | |
139 *code_point = src[*char_index]; | |
140 | |
141 // Validate the value. | |
142 return IsValidCodepoint(*code_point); | |
143 } | |
144 #endif // defined(WCHAR_T_IS_UTF32) | |
145 | |
146 // WriteUnicodeCharacter ------------------------------------------------------- | |
147 | |
148 // Appends a UTF-8 character to the given 8-bit string. | |
149 void WriteUnicodeCharacter(uint32 code_point, std::string* output) { | |
150 if (code_point <= 0x7f) { | |
151 // Fast path the common case of one byte. | |
152 output->push_back(code_point); | |
153 return; | |
154 } | |
155 | |
156 // U8_APPEND_UNSAFE can append up to 4 bytes. | |
157 int32 char_offset = static_cast<int32>(output->length()); | |
158 output->resize(char_offset + U8_MAX_LENGTH); | |
159 | |
160 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
161 | |
162 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so | |
163 // it will represent the new length of the string. | |
164 output->resize(char_offset); | |
165 } | |
166 | |
167 // Appends the given code point as a UTF-16 character to the STL string. | |
168 void WriteUnicodeCharacter(uint32 code_point, string16* output) { | |
169 if (U16_LENGTH(code_point) == 1) { | |
170 // Thie code point is in the Basic Multilingual Plane (BMP). | |
171 output->push_back(static_cast<char16>(code_point)); | |
172 } else { | |
173 // Non-BMP characters use a double-character encoding. | |
174 int32 char_offset = static_cast<int32>(output->length()); | |
175 output->resize(char_offset + U16_MAX_LENGTH); | |
176 U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); | |
177 } | |
178 } | |
179 | |
180 #if defined(WCHAR_T_IS_UTF32) | |
181 // Appends the given UTF-32 character to the given 32-bit string. | |
182 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { | |
183 // This is the easy case, just append the character. | |
184 output->push_back(code_point); | |
185 } | |
186 #endif // defined(WCHAR_T_IS_UTF32) | |
187 | |
188 // Generalized Unicode converter ----------------------------------------------- | |
189 | |
190 // Converts the given source Unicode character type to the given destination | |
191 // Unicode character type as a STL string. The given input buffer and size | |
192 // determine the source, and the given output STL string will be replaced by | |
193 // the result. | |
194 template<typename SRC_CHAR, typename DEST_STRING> | |
195 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { | |
196 output->clear(); | |
197 | |
198 // ICU requires 32-bit numbers. | |
199 bool success = true; | |
200 int32 src_len32 = static_cast<int32>(src_len); | |
201 for (int32 i = 0; i < src_len32; i++) { | |
202 uint32 code_point; | |
203 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | |
204 WriteUnicodeCharacter(code_point, output); | |
205 } else { | |
206 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) | |
207 // in place of an invalid codepoint. | |
208 success = false; | |
209 } | |
210 } | |
211 return success; | |
212 } | |
213 | |
214 | |
// Guesses how many bytes the UTF-8 encoding of |src| will need and reserves
// that much capacity in |*output|. Only capacity is affected; the contents and
// length of |*output| are left alone.
//
// The guess is based solely on the first code unit: if it is below 0x80 the
// whole input is assumed to be ASCII (one byte per unit), otherwise every unit
// is assumed to need three bytes. The comparison assumes the character type is
// unsigned, which is expected to hold for the UTF-16 and UTF-32 inputs this is
// used with.
//
// An empty input (|src_len| == 0) reserves nothing and does not read |src|.
template<typename CHAR>
void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
  // With no input there is nothing to size, and src[0] would be out of
  // bounds.
  if (src_len == 0)
    return;

  if (src[0] < 0x80) {
    // Assume that the entire input will be ASCII.
    output->reserve(src_len);
  } else {
    // Assume that the entire input is non-ASCII and will have 3 bytes per char.
    output->reserve(src_len * 3);
  }
}
229 | |
// Guesses how many elements the UTF-16 or UTF-32 form of the UTF-8 input |src|
// will need and reserves that much capacity in |*output|. Only capacity is
// affected; the contents and length of |*output| are left alone.
//
// The guess is based solely on the first byte: if it is below 0x80 the whole
// input is assumed to be ASCII (one output element per byte), otherwise every
// character is assumed to occupy two input bytes.
//
// An empty input (|src_len| == 0) reserves nothing and does not read |src|.
template<typename STRING>
void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
  // With no input there is nothing to size, and src[0] would be out of
  // bounds.
  if (src_len == 0)
    return;

  // char may be signed, so compare the byte as an unsigned value.
  if (static_cast<unsigned char>(src[0]) < 0x80) {
    // Assume the input is all ASCII, which means 1:1 correspondence.
    output->reserve(src_len);
  } else {
    // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
    // character.
    output->reserve(src_len / 2);
  }
}
244 | |
245 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, | 85 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, |
246 int uchar_len, OnStringUtilConversionError::Type on_error, | 86 int uchar_len, OnStringUtilConversionError::Type on_error, |
247 std::string* encoded) { | 87 std::string* encoded) { |
248 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, | 88 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, |
249 ucnv_getMaxCharSize(converter)); | 89 ucnv_getMaxCharSize(converter)); |
250 encoded->resize(encoded_max_length); | 90 encoded->resize(encoded_max_length); |
251 | 91 |
252 UErrorCode status = U_ZERO_ERROR; | 92 UErrorCode status = U_ZERO_ERROR; |
253 | 93 |
254 // Setup our error handler. | 94 // Setup our error handler. |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
301 inline UConverterType utf32_platform_endian() { | 141 inline UConverterType utf32_platform_endian() { |
302 #if U_IS_BIG_ENDIAN | 142 #if U_IS_BIG_ENDIAN |
303 return UCNV_UTF32_BigEndian; | 143 return UCNV_UTF32_BigEndian; |
304 #else | 144 #else |
305 return UCNV_UTF32_LittleEndian; | 145 return UCNV_UTF32_LittleEndian; |
306 #endif | 146 #endif |
307 } | 147 } |
308 | 148 |
309 } // namespace | 149 } // namespace |
310 | 150 |
311 // UTF-8 <-> Wide -------------------------------------------------------------- | |
312 | |
313 std::string WideToUTF8(const std::wstring& wide) { | |
314 std::string ret; | |
315 if (wide.empty()) | |
316 return ret; | |
317 | |
318 // Ignore the success flag of this call, it will do the best it can for | |
319 // invalid input, which is what we want here. | |
320 WideToUTF8(wide.data(), wide.length(), &ret); | |
321 return ret; | |
322 } | |
323 | |
324 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { | |
325 if (src_len == 0) { | |
326 output->clear(); | |
327 return true; | |
328 } | |
329 | |
330 ReserveUTF8Output(src, src_len, output); | |
331 return ConvertUnicode<wchar_t, std::string>(src, src_len, output); | |
332 } | |
333 | |
334 std::wstring UTF8ToWide(const base::StringPiece& utf8) { | |
335 std::wstring ret; | |
336 if (utf8.empty()) | |
337 return ret; | |
338 | |
339 UTF8ToWide(utf8.data(), utf8.length(), &ret); | |
340 return ret; | |
341 } | |
342 | |
343 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { | |
344 if (src_len == 0) { | |
345 output->clear(); | |
346 return true; | |
347 } | |
348 | |
349 ReserveUTF16Or32Output(src, src_len, output); | |
350 return ConvertUnicode<char, std::wstring>(src, src_len, output); | |
351 } | |
352 | |
353 // UTF-16 <-> Wide ------------------------------------------------------------- | |
354 | |
355 #if defined(WCHAR_T_IS_UTF16) | |
356 | |
357 // When wide == UTF-16, then conversions are a NOP. | |
358 string16 WideToUTF16(const std::wstring& wide) { | |
359 return wide; | |
360 } | |
361 | |
362 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
363 output->assign(src, src_len); | |
364 return true; | |
365 } | |
366 | |
367 std::wstring UTF16ToWide(const string16& utf16) { | |
368 return utf16; | |
369 } | |
370 | |
371 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
372 output->assign(src, src_len); | |
373 return true; | |
374 } | |
375 | |
376 #elif defined(WCHAR_T_IS_UTF32) | |
377 | |
378 string16 WideToUTF16(const std::wstring& wide) { | |
379 string16 ret; | |
380 if (wide.empty()) | |
381 return ret; | |
382 | |
383 WideToUTF16(wide.data(), wide.length(), &ret); | |
384 return ret; | |
385 } | |
386 | |
387 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { | |
388 if (src_len == 0) { | |
389 output->clear(); | |
390 return true; | |
391 } | |
392 | |
393 // Assume that normally we won't have any non-BMP characters so the counts | |
394 // will be the same. | |
395 output->reserve(src_len); | |
396 return ConvertUnicode<wchar_t, string16>(src, src_len, output); | |
397 } | |
398 | |
399 std::wstring UTF16ToWide(const string16& utf16) { | |
400 std::wstring ret; | |
401 if (utf16.empty()) | |
402 return ret; | |
403 | |
404 UTF16ToWide(utf16.data(), utf16.length(), &ret); | |
405 return ret; | |
406 } | |
407 | |
408 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { | |
409 if (src_len == 0) { | |
410 output->clear(); | |
411 return true; | |
412 } | |
413 | |
414 // Assume that normally we won't have any non-BMP characters so the counts | |
415 // will be the same. | |
416 output->reserve(src_len); | |
417 return ConvertUnicode<char16, std::wstring>(src, src_len, output); | |
418 } | |
419 | |
420 #endif // defined(WCHAR_T_IS_UTF32) | |
421 | |
422 // UTF16 <-> UTF8 -------------------------------------------------------------- | |
423 | |
424 #if defined(WCHAR_T_IS_UTF32) | |
425 | |
426 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
427 if (src_len == 0) { | |
428 output->clear(); | |
429 return true; | |
430 } | |
431 | |
432 ReserveUTF16Or32Output(src, src_len, output); | |
433 return ConvertUnicode<char, string16>(src, src_len, output); | |
434 } | |
435 | |
436 string16 UTF8ToUTF16(const std::string& utf8) { | |
437 string16 ret; | |
438 if (utf8.empty()) | |
439 return ret; | |
440 | |
441 // Ignore the success flag of this call, it will do the best it can for | |
442 // invalid input, which is what we want here. | |
443 UTF8ToUTF16(utf8.data(), utf8.length(), &ret); | |
444 return ret; | |
445 } | |
446 | |
447 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
448 if (src_len == 0) { | |
449 output->clear(); | |
450 return true; | |
451 } | |
452 | |
453 ReserveUTF8Output(src, src_len, output); | |
454 return ConvertUnicode<char16, std::string>(src, src_len, output); | |
455 } | |
456 | |
457 std::string UTF16ToUTF8(const string16& utf16) { | |
458 std::string ret; | |
459 if (utf16.empty()) | |
460 return ret; | |
461 | |
462 // Ignore the success flag of this call, it will do the best it can for | |
463 // invalid input, which is what we want here. | |
464 UTF16ToUTF8(utf16.data(), utf16.length(), &ret); | |
465 return ret; | |
466 } | |
467 | |
468 #elif defined(WCHAR_T_IS_UTF16) | |
469 // Easy case since we can use the "wide" versions we already wrote above. | |
470 | |
471 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { | |
472 return UTF8ToWide(src, src_len, output); | |
473 } | |
474 | |
475 string16 UTF8ToUTF16(const std::string& utf8) { | |
476 return UTF8ToWide(utf8); | |
477 } | |
478 | |
479 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { | |
480 return WideToUTF8(src, src_len, output); | |
481 } | |
482 | |
483 std::string UTF16ToUTF8(const string16& utf16) { | |
484 return WideToUTF8(utf16); | |
485 } | |
486 | |
487 #endif | |
488 | |
489 // Codepage <-> Wide/UTF-16 --------------------------------------------------- | 151 // Codepage <-> Wide/UTF-16 --------------------------------------------------- |
490 | 152 |
491 // Convert a wstring into the specified codepage_name. If the codepage | 153 // Convert a wstring into the specified codepage_name. If the codepage |
492 // isn't found, return false. | 154 // isn't found, return false. |
493 bool WideToCodepage(const std::wstring& wide, | 155 bool WideToCodepage(const std::wstring& wide, |
494 const char* codepage_name, | 156 const char* codepage_name, |
495 OnStringUtilConversionError::Type on_error, | 157 OnStringUtilConversionError::Type on_error, |
496 std::string* encoded) { | 158 std::string* encoded) { |
497 #if defined(WCHAR_T_IS_UTF16) | 159 #if defined(WCHAR_T_IS_UTF16) |
498 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); | 160 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
617 ucnv_close(converter); | 279 ucnv_close(converter); |
618 if (!U_SUCCESS(status)) { | 280 if (!U_SUCCESS(status)) { |
619 utf16->clear(); // Make sure the output is empty on error. | 281 utf16->clear(); // Make sure the output is empty on error. |
620 return false; | 282 return false; |
621 } | 283 } |
622 | 284 |
623 utf16->resize(actual_size); | 285 utf16->resize(actual_size); |
624 return true; | 286 return true; |
625 } | 287 } |
626 | 288 |
OLD | NEW |