Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1279)

Side by Side Diff: base/i18n/icu_string_conversions.cc

Issue 243102: Convert base dependencies to use sys_string_conversions instead of the ICU... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/i18n/icu_string_conversions.h ('k') | base/i18n/string_conversions.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Property Changes:
Added: svn:mergeinfo
Merged /branches/chrome_webkit_merge_branch/base/i18n/string_conversions.cc:r69-2775
OLDNEW
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/i18n/string_conversions.h" 5 #include "base/i18n/icu_string_conversions.h"
6 6
7 #include <vector> 7 #include <vector>
8 8
9 #include "base/basictypes.h" 9 #include "base/basictypes.h"
10 #include "base/logging.h" 10 #include "base/logging.h"
11 #include "base/string_util.h" 11 #include "base/string_util.h"
12 #include "unicode/ucnv.h" 12 #include "unicode/ucnv.h"
13 #include "unicode/ucnv_cb.h" 13 #include "unicode/ucnv_cb.h"
14 #include "unicode/ucnv_err.h" 14 #include "unicode/ucnv_err.h"
15 #include "unicode/ustring.h" 15 #include "unicode/ustring.h"
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
75 (*(reinterpret_cast<const char*>(context)) == 'i' && 75 (*(reinterpret_cast<const char*>(context)) == 'i' &&
76 reason == UCNV_UNASSIGNED)) { 76 reason == UCNV_UNASSIGNED)) {
77 *err = U_ZERO_ERROR; 77 *err = U_ZERO_ERROR;
78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); 78 ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
79 } 79 }
80 // else the caller must have set the error code accordingly. 80 // else the caller must have set the error code accordingly.
81 } 81 }
82 // else ignore the reset, close and clone calls. 82 // else ignore the reset, close and clone calls.
83 } 83 }
84 84
85 // ReadUnicodeCharacter --------------------------------------------------------
86
87 // Reads a UTF-8 stream, placing the next code point into the given output
88 // |*code_point|. |src| represents the entire string to read, and |*char_index|
89 // is the character offset within the string to start reading at. |*char_index|
90 // will be updated to index the last character read, such that incrementing it
91 // (as in a for loop) will take the reader to the next character.
92 //
93 // Returns true on success. On failure, |*code_point| will be invalid.
94 bool ReadUnicodeCharacter(const char* src, int32 src_len,
95 int32* char_index, uint32* code_point_out) {
96 // U8_NEXT expects to be able to use -1 to signal an error, so we must
97 // use a signed type for code_point. But this function returns false
98 // on error anyway, so code_point_out is unsigned.
99 int32 code_point;
100 U8_NEXT(src, *char_index, src_len, code_point);
101 *code_point_out = static_cast<uint32>(code_point);
102
103 // The ICU macro above moves to the next char, but we want to point to the
104 // last char consumed.
105 (*char_index)--;
106
107 // Validate the decoded value.
108 return IsValidCodepoint(code_point);
109 }
110
111 // Reads a UTF-16 character. The usage is the same as the 8-bit version above.
112 bool ReadUnicodeCharacter(const char16* src, int32 src_len,
113 int32* char_index, uint32* code_point) {
114 if (U16_IS_SURROGATE(src[*char_index])) {
115 if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||
116 *char_index + 1 >= src_len ||
117 !U16_IS_TRAIL(src[*char_index + 1])) {
118 // Invalid surrogate pair.
119 return false;
120 }
121
122 // Valid surrogate pair.
123 *code_point = U16_GET_SUPPLEMENTARY(src[*char_index],
124 src[*char_index + 1]);
125 (*char_index)++;
126 } else {
127 // Not a surrogate, just one 16-bit word.
128 *code_point = src[*char_index];
129 }
130
131 return IsValidCodepoint(*code_point);
132 }
133
134 #if defined(WCHAR_T_IS_UTF32)
135 // Reads UTF-32 character. The usage is the same as the 8-bit version above.
136 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
137 int32* char_index, uint32* code_point) {
138 // Conversion is easy since the source is 32-bit.
139 *code_point = src[*char_index];
140
141 // Validate the value.
142 return IsValidCodepoint(*code_point);
143 }
144 #endif // defined(WCHAR_T_IS_UTF32)
145
146 // WriteUnicodeCharacter -------------------------------------------------------
147
148 // Appends a UTF-8 character to the given 8-bit string.
149 void WriteUnicodeCharacter(uint32 code_point, std::string* output) {
150 if (code_point <= 0x7f) {
151 // Fast path the common case of one byte.
152 output->push_back(code_point);
153 return;
154 }
155
156 // U8_APPEND_UNSAFE can append up to 4 bytes.
157 int32 char_offset = static_cast<int32>(output->length());
158 output->resize(char_offset + U8_MAX_LENGTH);
159
160 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
161
162 // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so
163 // it will represent the new length of the string.
164 output->resize(char_offset);
165 }
166
167 // Appends the given code point as a UTF-16 character to the STL string.
168 void WriteUnicodeCharacter(uint32 code_point, string16* output) {
169 if (U16_LENGTH(code_point) == 1) {
170 // This code point is in the Basic Multilingual Plane (BMP).
171 output->push_back(static_cast<char16>(code_point));
172 } else {
173 // Non-BMP characters use a double-character encoding.
174 int32 char_offset = static_cast<int32>(output->length());
175 output->resize(char_offset + U16_MAX_LENGTH);
176 U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
177 }
178 }
179
180 #if defined(WCHAR_T_IS_UTF32)
181 // Appends the given UTF-32 character to the given 32-bit string.
182 inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
183 // This is the easy case, just append the character.
184 output->push_back(code_point);
185 }
186 #endif // defined(WCHAR_T_IS_UTF32)
187
188 // Generalized Unicode converter -----------------------------------------------
189
190 // Converts the given source Unicode character type to the given destination
191 // Unicode character type as a STL string. The given input buffer and size
192 // determine the source, and the given output STL string will be replaced by
193 // the result.
194 template<typename SRC_CHAR, typename DEST_STRING>
195 bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
196 output->clear();
197
198 // ICU requires 32-bit numbers.
199 bool success = true;
200 int32 src_len32 = static_cast<int32>(src_len);
201 for (int32 i = 0; i < src_len32; i++) {
202 uint32 code_point;
203 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
204 WriteUnicodeCharacter(code_point, output);
205 } else {
206 // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
207 // in place of an invalid codepoint.
208 success = false;
209 }
210 }
211 return success;
212 }
213
214
215 // Guesses the length of the output in UTF-8 in bytes, and reserves that amount
216 // of space in the given string. We also assume that the input character types
217 // are unsigned, which will be true for UTF-16 and -32 on our systems. We assume
218 // the string length is greater than zero.
219 template<typename CHAR>
220 void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
221 if (src[0] < 0x80) {
222 // Assume that the entire input will be ASCII.
223 output->reserve(src_len);
224 } else {
225 // Assume that the entire input is non-ASCII and will have 3 bytes per char.
226 output->reserve(src_len * 3);
227 }
228 }
229
230 // Guesses the size of the output buffer (containing either UTF-16 or -32 data)
231 // given some UTF-8 input that will be converted to it. See ReserveUTF8Output.
232 // We assume the source length is > 0.
233 template<typename STRING>
234 void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
235 if (static_cast<unsigned char>(src[0]) < 0x80) {
236 // Assume the input is all ASCII, which means 1:1 correspondence.
237 output->reserve(src_len);
238 } else {
239 // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
240 // character.
241 output->reserve(src_len / 2);
242 }
243 }
244
245 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, 85 bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
246 int uchar_len, OnStringUtilConversionError::Type on_error, 86 int uchar_len, OnStringUtilConversionError::Type on_error,
247 std::string* encoded) { 87 std::string* encoded) {
248 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, 88 int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
249 ucnv_getMaxCharSize(converter)); 89 ucnv_getMaxCharSize(converter));
250 encoded->resize(encoded_max_length); 90 encoded->resize(encoded_max_length);
251 91
252 UErrorCode status = U_ZERO_ERROR; 92 UErrorCode status = U_ZERO_ERROR;
253 93
254 // Setup our error handler. 94 // Setup our error handler.
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
301 inline UConverterType utf32_platform_endian() { 141 inline UConverterType utf32_platform_endian() {
302 #if U_IS_BIG_ENDIAN 142 #if U_IS_BIG_ENDIAN
303 return UCNV_UTF32_BigEndian; 143 return UCNV_UTF32_BigEndian;
304 #else 144 #else
305 return UCNV_UTF32_LittleEndian; 145 return UCNV_UTF32_LittleEndian;
306 #endif 146 #endif
307 } 147 }
308 148
309 } // namespace 149 } // namespace
310 150
311 // UTF-8 <-> Wide --------------------------------------------------------------
312
313 std::string WideToUTF8(const std::wstring& wide) {
314 std::string ret;
315 if (wide.empty())
316 return ret;
317
318 // Ignore the success flag of this call, it will do the best it can for
319 // invalid input, which is what we want here.
320 WideToUTF8(wide.data(), wide.length(), &ret);
321 return ret;
322 }
323
324 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
325 if (src_len == 0) {
326 output->clear();
327 return true;
328 }
329
330 ReserveUTF8Output(src, src_len, output);
331 return ConvertUnicode<wchar_t, std::string>(src, src_len, output);
332 }
333
334 std::wstring UTF8ToWide(const base::StringPiece& utf8) {
335 std::wstring ret;
336 if (utf8.empty())
337 return ret;
338
339 UTF8ToWide(utf8.data(), utf8.length(), &ret);
340 return ret;
341 }
342
343 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
344 if (src_len == 0) {
345 output->clear();
346 return true;
347 }
348
349 ReserveUTF16Or32Output(src, src_len, output);
350 return ConvertUnicode<char, std::wstring>(src, src_len, output);
351 }
352
353 // UTF-16 <-> Wide -------------------------------------------------------------
354
355 #if defined(WCHAR_T_IS_UTF16)
356
357 // When wide == UTF-16, then conversions are a NOP.
358 string16 WideToUTF16(const std::wstring& wide) {
359 return wide;
360 }
361
362 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
363 output->assign(src, src_len);
364 return true;
365 }
366
367 std::wstring UTF16ToWide(const string16& utf16) {
368 return utf16;
369 }
370
371 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
372 output->assign(src, src_len);
373 return true;
374 }
375
376 #elif defined(WCHAR_T_IS_UTF32)
377
378 string16 WideToUTF16(const std::wstring& wide) {
379 string16 ret;
380 if (wide.empty())
381 return ret;
382
383 WideToUTF16(wide.data(), wide.length(), &ret);
384 return ret;
385 }
386
387 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
388 if (src_len == 0) {
389 output->clear();
390 return true;
391 }
392
393 // Assume that normally we won't have any non-BMP characters so the counts
394 // will be the same.
395 output->reserve(src_len);
396 return ConvertUnicode<wchar_t, string16>(src, src_len, output);
397 }
398
399 std::wstring UTF16ToWide(const string16& utf16) {
400 std::wstring ret;
401 if (utf16.empty())
402 return ret;
403
404 UTF16ToWide(utf16.data(), utf16.length(), &ret);
405 return ret;
406 }
407
408 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
409 if (src_len == 0) {
410 output->clear();
411 return true;
412 }
413
414 // Assume that normally we won't have any non-BMP characters so the counts
415 // will be the same.
416 output->reserve(src_len);
417 return ConvertUnicode<char16, std::wstring>(src, src_len, output);
418 }
419
420 #endif // defined(WCHAR_T_IS_UTF32)
421
422 // UTF16 <-> UTF8 --------------------------------------------------------------
423
424 #if defined(WCHAR_T_IS_UTF32)
425
426 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
427 if (src_len == 0) {
428 output->clear();
429 return true;
430 }
431
432 ReserveUTF16Or32Output(src, src_len, output);
433 return ConvertUnicode<char, string16>(src, src_len, output);
434 }
435
436 string16 UTF8ToUTF16(const std::string& utf8) {
437 string16 ret;
438 if (utf8.empty())
439 return ret;
440
441 // Ignore the success flag of this call, it will do the best it can for
442 // invalid input, which is what we want here.
443 UTF8ToUTF16(utf8.data(), utf8.length(), &ret);
444 return ret;
445 }
446
447 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
448 if (src_len == 0) {
449 output->clear();
450 return true;
451 }
452
453 ReserveUTF8Output(src, src_len, output);
454 return ConvertUnicode<char16, std::string>(src, src_len, output);
455 }
456
457 std::string UTF16ToUTF8(const string16& utf16) {
458 std::string ret;
459 if (utf16.empty())
460 return ret;
461
462 // Ignore the success flag of this call, it will do the best it can for
463 // invalid input, which is what we want here.
464 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
465 return ret;
466 }
467
468 #elif defined(WCHAR_T_IS_UTF16)
469 // Easy case since we can use the "wide" versions we already wrote above.
470
471 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
472 return UTF8ToWide(src, src_len, output);
473 }
474
475 string16 UTF8ToUTF16(const std::string& utf8) {
476 return UTF8ToWide(utf8);
477 }
478
479 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
480 return WideToUTF8(src, src_len, output);
481 }
482
483 std::string UTF16ToUTF8(const string16& utf16) {
484 return WideToUTF8(utf16);
485 }
486
487 #endif
488
489 // Codepage <-> Wide/UTF-16 --------------------------------------------------- 151 // Codepage <-> Wide/UTF-16 ---------------------------------------------------
490 152
491 // Convert a wstring into the specified codepage_name. If the codepage 153 // Convert a wstring into the specified codepage_name. If the codepage
492 // isn't found, return false. 154 // isn't found, return false.
493 bool WideToCodepage(const std::wstring& wide, 155 bool WideToCodepage(const std::wstring& wide,
494 const char* codepage_name, 156 const char* codepage_name,
495 OnStringUtilConversionError::Type on_error, 157 OnStringUtilConversionError::Type on_error,
496 std::string* encoded) { 158 std::string* encoded) {
497 #if defined(WCHAR_T_IS_UTF16) 159 #if defined(WCHAR_T_IS_UTF16)
498 return UTF16ToCodepage(wide, codepage_name, on_error, encoded); 160 return UTF16ToCodepage(wide, codepage_name, on_error, encoded);
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
617 ucnv_close(converter); 279 ucnv_close(converter);
618 if (!U_SUCCESS(status)) { 280 if (!U_SUCCESS(status)) {
619 utf16->clear(); // Make sure the output is empty on error. 281 utf16->clear(); // Make sure the output is empty on error.
620 return false; 282 return false;
621 } 283 }
622 284
623 utf16->resize(actual_size); 285 utf16->resize(actual_size);
624 return true; 286 return true;
625 } 287 }
626 288
OLDNEW
« no previous file with comments | « base/i18n/icu_string_conversions.h ('k') | base/i18n/string_conversions.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698